From 2b49ea6703382c1f31a536476bc2571768cd8ece Mon Sep 17 00:00:00 2001 From: Dongyang Geng Date: Tue, 17 Mar 2026 17:34:59 +0800 Subject: [PATCH 1/7] fix: guard against tail/tailUsed race in TileVisibility GC and query paths --- cpp/pixels-retina/lib/TileVisibility.cpp | 35 ++++ cpp/pixels-retina/test/TileVisibilityTest.cpp | 190 ++++++++++++++++++ .../pixels/retina/RetinaResourceManager.java | 73 +++++-- 3 files changed, 278 insertions(+), 20 deletions(-) diff --git a/cpp/pixels-retina/lib/TileVisibility.cpp b/cpp/pixels-retina/lib/TileVisibility.cpp index 1123032e2..4d9480802 100644 --- a/cpp/pixels-retina/lib/TileVisibility.cpp +++ b/cpp/pixels-retina/lib/TileVisibility.cpp @@ -199,14 +199,33 @@ void TileVisibility::getTileVisibilityBitmap(uint64_t ts, uint64_t* ou size_t currentTailUsed = tailUsed.load(std::memory_order_relaxed); size_t count = (blk == currentTail) ? currentTailUsed : DeleteIndexBlock::BLOCK_CAPACITY; + // Same tail/tailUsed race as in collectTileGarbage: count may be 0 or + // a stale BLOCK_CAPACITY for a newly-created tail block. count == 0 + // means no items to read; skip cleanly. The stale-count case (items + // beyond the first being zero-initialised) is handled in the scalar + // path below via the item == 0 sentinel check. + if (count == 0) { + blk = blk->next.load(std::memory_order_relaxed); + continue; + } + uint64_t i = 0; #ifdef RETINA_SIMD + // NOTE: the SIMD path does not check for zero-initialised (item == 0) + // sentinel values. In the extremely rare stale-tailUsed race window, + // up to BLOCK_CAPACITY-1 zero items may cause row 0 to be transiently + // marked as deleted in the output bitmap. This is a known limitation + // of the SIMD fast path; the effect is transient (not persisted) and + // self-correcting on the next query once tailUsed is fully updated. for (; i + 4 <= count; i += 4) { process_bitmap_block_256(blk, i, outBitmap, vThrFlip, tsMask, signBit); } #endif for (; i < count; i++) { uint64_t item = blk->items[i]; + // Sentinel: zero item signals an uninitialised slot (see + // collectTileGarbage for the full race description). + if (item == 0) return; if (extractTimestamp(item) <= ts) { SET_BITMAP_BIT(outBitmap, extractRowId(item)); } else { @@ -232,6 +251,14 @@ void TileVisibility::collectTileGarbage(uint64_t ts) { size_t count = (blk == tail.load(std::memory_order_acquire)) ? tailUsed.load(std::memory_order_acquire) : DeleteIndexBlock::BLOCK_CAPACITY; + // Guard: deleteTileRecord updates `tail` and `tailUsed` non-atomically. + // In the narrow window after `tail` is advanced to a new block but before + // `tailUsed.store(1)` completes, we may observe count == 0 (empty-list + // path: tailUsed transitions 0 → 1) or a stale BLOCK_CAPACITY (full-block + // path: tailUsed transitions BLOCK_CAPACITY → 1 via store, not CAS). + // When count == 0 there is nothing to compact; stop here and let the next + // GC cycle handle the block once it is fully initialised. + if (count == 0) break; uint64_t lastItemTs = extractTimestamp(blk->items[count - 1]); if (lastItemTs <= ts) { lastFullBlk = blk; @@ -253,6 +280,14 @@ void TileVisibility::collectTileGarbage(uint64_t ts) { size_t count = (blk == lastFullBlk && blk == tail.load()) ? tailUsed.load() : DeleteIndexBlock::BLOCK_CAPACITY; for (size_t i = 0; i < count; i++) { uint64_t item = blk->items[i]; + // Guard: a zero item means an uninitialised slot in a newly-created + // tail block observed under the same tail/tailUsed race described + // above (full-block path: tailUsed is still BLOCK_CAPACITY while + // only items[0] is valid; items[1..n] remain zero-initialised). + // item == 0 encodes makeDeleteIndex(rowId=0, ts=0); since all valid + // transaction timestamps are > 0, this value is never a legitimate + // deletion record and safely identifies the end of valid items. + if (item == 0) break; if (extractTimestamp(item) <= ts) { SET_BITMAP_BIT(newBaseBitmap, extractRowId(item)); } diff --git a/cpp/pixels-retina/test/TileVisibilityTest.cpp b/cpp/pixels-retina/test/TileVisibilityTest.cpp index 83fe099bb..60fdcc802 100644 --- a/cpp/pixels-retina/test/TileVisibilityTest.cpp +++ b/cpp/pixels-retina/test/TileVisibilityTest.cpp @@ -268,3 +268,193 @@ TEST_F(TileVisibilityTest, MultiThread) { EXPECT_TRUE(checkBitmap(finalBitmap, expectedFinalBitmap)); } + +/** + * ZeroSentinelInGarbageCollect — deterministic regression for Scenario 2 guard. + * + * The fix for the full-block race (Scenario 2) relies on treating item=0 as a + * sentinel that marks uninitialised tail slots. makeDeleteIndex(rowId=0, ts=0)=0, + * which is identical to the zero-initialised memory of a freshly-allocated block. + * + * Precondition enforced by TransService: all valid transaction timestamps are > 0, + * so ts=0 can never represent a real deletion and is safe to use as a sentinel. + * + * This test simulates the exact item value produced by the race without requiring + * concurrent execution: + * 1. Fill BLOCK_CAPACITY-1 slots with valid (rowId, ts) pairs. + * 2. Insert makeDeleteIndex(0,0)=0 into the last slot — the same value a + * zero-initialised slot in a new block would have during the race window. + * 3. Run GC: without the fix, extractTimestamp(0)=0 ≤ ts would SET_BITMAP_BIT(0). + * with the fix, `if (item == 0) break` stops before touching bit 0. + * + * Failure mode WITHOUT fix: bits 0..BLOCK_CAPACITY-2 set (bit 0 is spurious). + * Pass condition WITH fix: bits 1..BLOCK_CAPACITY-2 set, bit 0 NOT set. + */ +TEST_F(TileVisibilityTest, ZeroSentinelInGarbageCollect) { + // Fill slots 0..BLOCK_CAPACITY-2 with valid items (rows 1..7, ts 1..7) + for (uint16_t i = 1; i < DeleteIndexBlock::BLOCK_CAPACITY; i++) { + v->deleteTileRecord(i, static_cast(i)); + } + // Insert the sentinel value (row=0, ts=0 → item=0) into the final slot. + // This replicates the zero-initialised items[1..7] that GC would encounter + // during the Scenario-2 race if tailUsed were stale at BLOCK_CAPACITY. + v->deleteTileRecord(0, 0); + + v->collectTileGarbage(100); + + uint64_t actualBitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(100, actualBitmap); + + // Rows 1..(BLOCK_CAPACITY-1) should be deleted; row 0 must NOT be set. + uint64_t expectedBitmap[BITMAP_SIZE] = {0}; + for (uint16_t i = 1; i < DeleteIndexBlock::BLOCK_CAPACITY; i++) { + SET_BITMAP_BIT(expectedBitmap, i); + } + EXPECT_TRUE(checkBitmap(actualBitmap, expectedBitmap)) + << "Row 0 must not be set: item==0 sentinel guard must stop GC " + "before processing zero-initialised (or ts=0) slots"; +} + +/** + * ConcurrentGCAndFirstInsert — targets Scenario 1 (empty-list path race). + * + * Race condition: + * deleteTileRecord (empty list) does: + * 1. tail.CAS(nullptr → newBlk) + * 2. currentVersion.CAS(oldVer → newVer with head=newBlk) ← head now visible + * 3. tailUsed.store(1) ← window: 2 done, 3 not yet + * + * If collectTileGarbage runs between steps 2 and 3: + * - blk = newVer->head = newBlk (reachable) + * - count = tailUsed = 0 (not yet updated) + * - BEFORE FIX: items[count-1] = items[size_t(-1)] → size_t underflow → UB / crash + * - AFTER FIX: count==0 guard breaks out safely; no crash. + * + * NOTE on test reliability: the race window is between two adjacent atomic operations + * (currentVersion.CAS at line ~99 and tailUsed.store at line ~102 in deleteTileRecord). + * This is too narrow to trigger reliably with OS-level scheduling alone; the test is + * therefore a probabilistic stress test rather than a deterministic reproducer. For + * guaranteed detection, compile with AddressSanitizer + ThreadSanitizer or add a + * -DENABLE_TEST_HOOKS build flag that injects a sleep between the two operations. + * + * The primary value of this test is as a no-crash regression guard: if the count==0 + * guard is removed, a crash (size_t underflow → OOB array access) will eventually + * surface under sustained concurrent load even if it is not triggered every run. + */ +TEST_F(TileVisibilityTest, ConcurrentGCAndFirstInsert) { + constexpr int TRIALS = 200; + + for (int trial = 0; trial < TRIALS; trial++) { + delete v; + v = new TileVisibility(); + + std::atomic deleteStarted{false}; + std::atomic gcDone{false}; + + // GC thread: spin-waits until the delete thread has signalled it started, + // then immediately fires GC to maximise the chance of hitting the race window. + auto gcThread = std::thread([&]() { + while (!deleteStarted.load(std::memory_order_acquire)) {} + v->collectTileGarbage(1000); + gcDone.store(true, std::memory_order_release); + }); + + // Delete thread: signals start, then inserts the very first item (row=5, ts=100). + // Row 0 is intentionally never deleted so we can use bit 0 as a spurious-set + // canary in the companion scenario-2 test. + deleteStarted.store(true, std::memory_order_release); + v->deleteTileRecord(5, 100); + + gcThread.join(); + + // After both operations complete, GC with a ts that covers the inserted item + // and verify the bitmap is exactly {row 5 deleted}. + v->collectTileGarbage(1000); + uint64_t actualBitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(1000, actualBitmap); + + uint64_t expectedBitmap[BITMAP_SIZE] = {0}; + SET_BITMAP_BIT(expectedBitmap, 5); + + EXPECT_TRUE(checkBitmap(actualBitmap, expectedBitmap)) + << "Trial " << trial << ": bitmap incorrect after concurrent first-insert + GC"; + } +} + +/** + * ConcurrentGCAndBlockTransition — targets Scenario 2 (full-block path race). + * + * Race condition: + * deleteTileRecord (old tail block is full) does: + * 1. curTail->next.CAS(nullptr → newBlk) + * 2. tail.CAS(curTail → newBlk) ← tail now points to new block + * 3. tailUsed.store(1) ← window: 2 done, 3 not yet + * + * If collectTileGarbage runs between steps 2 and 3: + * - blk == tail (newBlk), count = tailUsed = BLOCK_CAPACITY (stale old value, 8) + * - items[0] is the real insertion; items[1..BLOCK_CAPACITY-1] are zero-initialised + * - BEFORE FIX: extractTimestamp(0)=0 ≤ ts → SET_BITMAP_BIT(extractRowId(0)=0) + * → bit 0 spuriously set in baseBitmap (persistent data corruption) + * - AFTER FIX: item==0 guard breaks the inner loop; no spurious bit 0. + * + * Strategy: pre-fill exactly BLOCK_CAPACITY items (one full block) with ts values + * that GC will compact, then concurrently fire GC and the (BLOCK_CAPACITY+1)-th + * insert that triggers new-block creation. Row 0 is never deleted; if the bug fires, + * getTileVisibilityBitmap will report bit 0 set even though row 0 was never deleted. + * + * NOTE on test reliability: identical narrow-window caveat as ConcurrentGCAndFirstInsert. + * The ZeroSentinelInGarbageCollect test above is the deterministic companion that + * verifies the item==0 guard logic directly without requiring concurrent execution. + */ +TEST_F(TileVisibilityTest, ConcurrentGCAndBlockTransition) { + constexpr uint64_t GC_TS = 1000; + // Number of concurrent trials; more iterations → higher probability of hitting the race. + constexpr int TRIALS = 500; + + std::atomic spuriousRow0{false}; + + for (int trial = 0; trial < TRIALS && !spuriousRow0.load(); trial++) { + delete v; + v = new TileVisibility(); + + // Pre-fill exactly BLOCK_CAPACITY (8) items so the next insert triggers + // the full-block → new-block code path. Use rows 1..8 (never row 0). + for (size_t i = 0; i < DeleteIndexBlock::BLOCK_CAPACITY; i++) { + v->deleteTileRecord(static_cast(i + 1), i + 1); + } + + std::atomic insertReady{false}; + + // GC thread: waits for the insert thread to be about to create the new block, + // then fires GC immediately to race with tail/tailUsed update. + auto gcThread = std::thread([&]() { + while (!insertReady.load(std::memory_order_acquire)) {} + v->collectTileGarbage(GC_TS); + }); + + // Insert thread: signal then insert the (BLOCK_CAPACITY+1)-th item to force + // new-block creation. Row 0 is the canary — never intentionally deleted. + insertReady.store(true, std::memory_order_release); + v->deleteTileRecord(10, DeleteIndexBlock::BLOCK_CAPACITY + 1); + + gcThread.join(); + + // Run one more clean GC to ensure everything that should be compacted is. + v->collectTileGarbage(GC_TS); + + // Check the canary: bit 0 must be 0 because row 0 was never deleted. + uint64_t bitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(GC_TS, bitmap); + + if (GET_BITMAP_BIT(bitmap, 0)) { + spuriousRow0.store(true); + ADD_FAILURE() << "Trial " << trial + << ": bit 0 spuriously set in bitmap — " + << "stale tailUsed race bug triggered (Scenario 2)"; + } + } + + EXPECT_FALSE(spuriousRow0.load()) + << "Row 0 was spuriously marked deleted by GC processing " + "zero-initialised slots of a newly created tail block."; +} diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java index 56b78883e..04f4c951d 100644 --- a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java @@ -323,17 +323,18 @@ private CompletableFuture createCheckpoint(long timestamp, CheckpointType // 4. Async Write: perform IO in background thread (Consumer) // Use commonPool to avoid deadlocks with checkpointExecutor return CompletableFuture.runAsync(() -> { - // Lock on filePath string intern to ensure only one thread writes to the same file + // Lock on filePath string intern to ensure only one thread writes to the same file. + // For OFFLOAD type this guards against concurrent requests for the same timestamp. + // For GC type runGC() is single-threaded, so the lock is a no-op in practice. synchronized (filePath.intern()) { + // Deduplicate concurrent OFFLOAD checkpoint requests for the same timestamp. + // GC type deduplication is handled upstream in runGC() via the + // `timestamp <= latestGcTimestamp` guard, so no check is needed here. if (type == CheckpointType.OFFLOAD && offloadedCheckpoints.containsKey(timestamp)) { return; } - if (type == CheckpointType.GC && timestamp <= latestGcTimestamp) - { - return; - } long startWrite = System.currentTimeMillis(); try @@ -346,15 +347,10 @@ private CompletableFuture createCheckpoint(long timestamp, CheckpointType if (type == CheckpointType.OFFLOAD) { offloadedCheckpoints.put(timestamp, filePath); - } else - { - long oldGcTs = this.latestGcTimestamp; - this.latestGcTimestamp = timestamp; - if (oldGcTs != -1 && oldGcTs != timestamp) - { - removeCheckpointFile(oldGcTs, CheckpointType.GC); - } } + // GC type: latestGcTimestamp update and old checkpoint cleanup are + // handled by runGC() after the full GC cycle (Memory GC + checkpoint + // + Storage GC) completes successfully. Do not update here. } catch (Exception e) { logger.error("Failed to commit {} checkpoint file for timestamp: {}", type, timestamp, e); @@ -592,7 +588,30 @@ private PixelsWriteBuffer checkPixelsWriteBuffer(String schema, String table, in } /** - * Run garbage collection on all registered RGVisibility. + * Run a full GC cycle: Memory GC → checkpoint → Storage GC (future). + * + *

Ordering rationale: + *

    + *
  1. Memory GC first: {@code collectTileGarbage} compacts Deletion Chain blocks + * whose last item ts ≤ lwm into {@code baseBitmap}. After compaction, the remaining + * chain starts at the first block that straddles the lwm boundary, so the subsequent + * {@code getVisibilityBitmap(lwm)} call traverses at most one partial block + * (≤ {@code BLOCK_CAPACITY} items) instead of the entire pre-GC chain. This makes + * checkpoint bitmap serialisation significantly cheaper.
  2. + *
  3. Checkpoint second, blocking: we call {@code .join()} so that + * {@code runGC()} does not return until the checkpoint file is fully written to disk. + * {@code gcExecutor} is a single-threaded scheduler; because {@code runGC()} must + * complete before the next invocation begins, this blocking join is the simplest way + * to guarantee that no two GC cycles ever overlap — no additional lock is required.
  4. + *
  5. Storage GC third: requires an up-to-date {@code baseBitmap} (hence after + * Memory GC) and its own WAL for crash recovery. Placing it after the checkpoint + * keeps the two recovery paths independent: on restart, the GC checkpoint restores + * the post-Memory-GC visibility state, and the GcWal resumes any in-progress Storage + * GC task separately.
  6. + *
  7. Advance {@code latestGcTimestamp} last: updated only after the entire cycle + * succeeds (Memory GC + checkpoint + Storage GC). If any step throws, the timestamp + * is not advanced and the next scheduled invocation will retry the full cycle.
  8. + *
*/ private void runGC() { @@ -613,13 +632,27 @@ private void runGC() try { - // 1. Persist first - createCheckpoint(timestamp, CheckpointType.GC); - // 2. Then clean memory - for (Map.Entry entry: this.rgVisibilityMap.entrySet()) + // Step 1: Memory GC — compact Deletion Chain into baseBitmap. + for (Map.Entry entry : this.rgVisibilityMap.entrySet()) + { + entry.getValue().garbageCollect(timestamp); + } + + // Step 2: Persist the post-Memory-GC visibility state. Block until the + // checkpoint file is fully written (see Javadoc above for why we join here). + createCheckpoint(timestamp, CheckpointType.GC).join(); + + // Step 3: Storage GC — rewrite high-deletion-ratio files (TODO). + // if (storageGcEnabled) storageGarbageCollector.runStorageGC(); + + // Step 4: Advance the timestamp only after the full cycle succeeds. + // latestGcTimestamp is no longer updated inside createCheckpoint's async + // callback for GC type; this is the single authoritative update point. + long oldGcTs = this.latestGcTimestamp; + this.latestGcTimestamp = timestamp; + if (oldGcTs != -1 && oldGcTs != timestamp) { - RGVisibility rgVisibility = entry.getValue(); - rgVisibility.garbageCollect(timestamp); + removeCheckpointFile(oldGcTs, CheckpointType.GC); } } catch (Exception e) { From ce9b15b79fca760dbf927f6e27283cbc00ad67f9 Mon Sep 17 00:00:00 2001 From: Dongyang Geng Date: Sat, 21 Mar 2026 14:27:41 +0800 Subject: [PATCH 2/7] fix: TOCTOU race condition in addVisibility --- .../io/pixelsdb/pixels/retina/RetinaResourceManager.java | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java index 04f4c951d..e2e37bbc2 100644 --- a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java @@ -139,12 +139,7 @@ public static RetinaResourceManager Instance() public void addVisibility(long fileId, int rgId, int recordNum) { String rgKey = fileId + "_" + rgId; - if (rgVisibilityMap.containsKey(rgKey)) - { - return; - } - - rgVisibilityMap.put(rgKey, new RGVisibility(recordNum)); + rgVisibilityMap.computeIfAbsent(rgKey, k -> new RGVisibility(recordNum)); } public void addVisibility(String filePath) throws RetinaException From e331b2630ffe16879c627064ed52e597de9d98f7 Mon Sep 17 00:00:00 2001 From: Dongyang Geng Date: Sat, 21 Mar 2026 16:51:47 +0800 Subject: [PATCH 3/7] fix: use-after-free in deleteTileRecord empty-chain path --- cpp/pixels-retina/include/TileVisibility.h | 17 ++++++++++++++- cpp/pixels-retina/lib/TileVisibility.cpp | 24 ++++++++++++++-------- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/cpp/pixels-retina/include/TileVisibility.h b/cpp/pixels-retina/include/TileVisibility.h index a7ac6c790..51bf11351 100644 --- a/cpp/pixels-retina/include/TileVisibility.h +++ b/cpp/pixels-retina/include/TileVisibility.h @@ -108,7 +108,22 @@ class TileVisibility : public pixels::RetinaBase> { std::atomic*> currentVersion; std::atomic tail; std::atomic tailUsed; - std::vector> retired; // Protected by GC (single writer) + + // Retired versions awaiting epoch-based reclamation. Only the GC thread + // (collectTileGarbage / reclaimRetiredVersions) reads and writes this vector, + // so no locking is needed. + std::vector> retired; + + // Lock-free staging slot between deleteTileRecord (CDC threads) and GC. + // deleteTileRecord's empty-chain path replaces currentVersion but cannot + // write `retired` directly — that would race with the GC thread. Instead + // it atomically stores oldVer here. The GC thread drains this slot at the + // start of collectTileGarbage, moving it into `retired` with a proper epoch. + // Flow: deleteTileRecord → pendingRetire.store → collectTileGarbage → + // pendingRetire.exchange(nullptr) → retired.emplace_back → reclaimRetiredVersions + // At most one version is pending per GC cycle (the empty-chain path fires + // at most once between consecutive GC compactions). + std::atomic*> pendingRetire{nullptr}; }; #endif // PIXELS_RETINA_TILE_VISIBILITY_H diff --git a/cpp/pixels-retina/lib/TileVisibility.cpp b/cpp/pixels-retina/lib/TileVisibility.cpp index 4d9480802..3195d7aa3 100644 --- a/cpp/pixels-retina/lib/TileVisibility.cpp +++ b/cpp/pixels-retina/lib/TileVisibility.cpp @@ -26,12 +26,6 @@ #include -#include "TileVisibility.h" -#include "EpochManager.h" -#include -#include -#include - template TileVisibility::TileVisibility() { VersionedData* initialVersion = new VersionedData(); @@ -61,6 +55,12 @@ TileVisibility::~TileVisibility() { delete ver; } + // Clean up any version left in the pending retirement slot + VersionedData* pending = pendingRetire.load(std::memory_order_acquire); + if (pending) { + delete pending; + } + // Clean up retired versions and their delete chains for (auto& retired : this->retired) { if (retired.data) { @@ -97,8 +97,9 @@ void TileVisibility::deleteTileRecord(uint16_t rowId, uint64_t ts) { VersionedData* newVer = new VersionedData(oldVer->baseTimestamp, oldVer->baseBitmap, newBlk); if (currentVersion.compare_exchange_strong(oldVer, newVer, std::memory_order_acq_rel)) { - // Success: retire old version (no chain to delete since head was nullptr) - delete oldVer; + // Defer retirement: a concurrent reader may still hold oldVer under EpochGuard. + // collectTileGarbage will drain this slot and epoch-retire it properly. + pendingRetire.store(oldVer, std::memory_order_release); tailUsed.store(1, std::memory_order_release); return; } else { @@ -238,6 +239,13 @@ void TileVisibility::getTileVisibilityBitmap(uint64_t ts, uint64_t* ou template void TileVisibility::collectTileGarbage(uint64_t ts) { + // Drain the pending retirement slot left by deleteTileRecord's empty-chain path. + VersionedData* pending = pendingRetire.exchange(nullptr, std::memory_order_acquire); + if (pending) { + uint64_t retireEpoch = EpochManager::getInstance().advanceEpoch(); + retired.emplace_back(pending, nullptr, retireEpoch); + } + // Load old version VersionedData* oldVer = currentVersion.load(std::memory_order_acquire); if (ts <= oldVer->baseTimestamp) return; From 8327c5c9928dbef2a6fbf74d6139d20291318b7c Mon Sep 17 00:00:00 2001 From: Dongyang Geng Date: Sat, 21 Mar 2026 22:42:29 +0800 Subject: [PATCH 4/7] fix: written files on Retina are uniformly formatted in ordered --- .../common/utils/PixelsFileNameUtils.java | 25 ++++++------------- .../pixels/retina/FileWriterManager.java | 2 +- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/PixelsFileNameUtils.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/PixelsFileNameUtils.java index 621b54c20..67586f7dd 100644 --- a/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/PixelsFileNameUtils.java +++ b/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/PixelsFileNameUtils.java @@ -43,8 +43,8 @@ *

File types

* * - * - * + * * * * @@ -80,7 +80,6 @@ public final class PixelsFileNameUtils */ public enum PxlFileType { - RETINA("retina"), ORDERED("ordered"), COMPACT("compact"), SINGLE("single"), @@ -131,11 +130,11 @@ public static PxlFileType fromLabel(String label) *
  • timestamp — exactly 14 digits (yyyyMMddHHmmss)
  • *
  • atomicCount
  • *
  • virtualNodeId — non-negative integer, or {@code -1} for single files
  • - *
  • type label — one of {@code retina|ordered|compact|single|copy}
  • + *
  • type label — one of {@code ordered|compact|single|copy}
  • * */ private static final Pattern PXL_PATTERN = Pattern.compile( - "(?:.*/)?(.+)_(\\d{14})_(\\d+)_(-?\\d+)_(retina|ordered|compact|single|copy)\\.pxl$"); + "(?:.*/)?(.+)_(\\d{14})_(\\d+)_(-?\\d+)_(ordered|compact|single|copy)\\.pxl$"); private PixelsFileNameUtils() {} @@ -161,16 +160,7 @@ public static String buildPxlFileName(String hostName, int virtualNodeId, PxlFil // ------------------------------------------------------------------------- /** - * Builds a Retina file name (CDC real-time write path). - *

    Format: {@code ____retina.pxl} - */ - public static String buildRetinaFileName(String hostName, int virtualNodeId) - { - return buildPxlFileName(hostName, virtualNodeId, PxlFileType.RETINA); - } - - /** - * Builds an Ordered file name (indexed batch load). + * Builds an Ordered file name (CDC real-time write path and indexed batch load). *

    Format: {@code ____ordered.pxl} */ public static String buildOrderedFileName(String hostName, int virtualNodeId) @@ -281,8 +271,7 @@ public static PxlFileType extractFileType(String path) /** * Returns {@code true} if the file at {@code path} is eligible for Storage GC, - * i.e. its type is one of {@link PxlFileType#RETINA}, {@link PxlFileType#ORDERED}, - * or {@link PxlFileType#COMPACT}. + * i.e. its type is one of {@link PxlFileType#ORDERED} or {@link PxlFileType#COMPACT}. * *

    {@link PxlFileType#SINGLE} and {@link PxlFileType#COPY} files, as well as * unrecognised paths, return {@code false}. @@ -290,7 +279,7 @@ public static PxlFileType extractFileType(String path) public static boolean isGcEligible(String path) { PxlFileType type = extractFileType(path); - return type == PxlFileType.RETINA || type == PxlFileType.ORDERED || type == PxlFileType.COMPACT; + return type == PxlFileType.ORDERED || type == PxlFileType.COMPACT; } // ------------------------------------------------------------------------- diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/FileWriterManager.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/FileWriterManager.java index 8a4960e1e..570335ad3 100644 --- a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/FileWriterManager.java +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/FileWriterManager.java @@ -76,7 +76,7 @@ public FileWriterManager(long tableId, TypeDescription schema, this.virtualNodeId = virtualNodeId; // Create pixels writer. - String targetFileName = PixelsFileNameUtils.buildRetinaFileName(hostName, virtualNodeId); + String targetFileName = PixelsFileNameUtils.buildOrderedFileName(hostName, virtualNodeId); String targetFilePath = targetOrderedDirPath.getUri() + "/" + targetFileName; try { From 9d5289fb115a5b2d245a819eb96070f1ae35dc8e Mon Sep 17 00:00:00 2001 From: Dongyang Geng Date: Sat, 28 Mar 2026 21:02:40 +0800 Subject: [PATCH 5/7] feat: produce gcSnapshotBitmap during Memory GC for zero-traversal checkpointing --- cpp/pixels-retina/include/RGVisibility.h | 2 +- cpp/pixels-retina/include/RGVisibilityJni.h | 4 +- cpp/pixels-retina/include/TileVisibility.h | 2 +- cpp/pixels-retina/lib/RGVisibility.cpp | 11 +- cpp/pixels-retina/lib/RGVisibilityJni.cpp | 11 +- cpp/pixels-retina/lib/TileVisibility.cpp | 42 ++- cpp/pixels-retina/test/RGVisibilityTest.cpp | 280 ++++++++++++++++++ cpp/pixels-retina/test/TileVisibilityTest.cpp | 21 +- .../pixelsdb/pixels/retina/RGVisibility.java | 6 +- .../pixels/retina/RetinaResourceManager.java | 29 +- .../pixels/retina/TestRGVisibility.java | 228 +++++++++++++- 11 files changed, 602 insertions(+), 34 deletions(-) diff --git a/cpp/pixels-retina/include/RGVisibility.h b/cpp/pixels-retina/include/RGVisibility.h index f1f29a014..3f1ff1bcd 100644 --- a/cpp/pixels-retina/include/RGVisibility.h +++ b/cpp/pixels-retina/include/RGVisibility.h @@ -33,7 +33,7 @@ class RGVisibility : public pixels::RetinaBase> { void deleteRGRecord(uint32_t rowId, uint64_t timestamp); uint64_t* getRGVisibilityBitmap(uint64_t timestamp); - void collectRGGarbage(uint64_t timestamp); + std::vector collectRGGarbage(uint64_t timestamp); uint64_t getBitmapSize() const; diff --git a/cpp/pixels-retina/include/RGVisibilityJni.h b/cpp/pixels-retina/include/RGVisibilityJni.h index 37339942d..d4cbb372f 100644 --- a/cpp/pixels-retina/include/RGVisibilityJni.h +++ b/cpp/pixels-retina/include/RGVisibilityJni.h @@ -50,9 +50,9 @@ JNIEXPORT jlongArray JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_getVisi /* * Class: io_pixelsdb_pixels_retina_RGVisibility * Method: garbageCollect - * Signature: (JJ)V + * Signature: (JJ)[J */ -JNIEXPORT void JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_garbageCollect +JNIEXPORT jlongArray JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_garbageCollect (JNIEnv *, jobject, jlong, jlong); /* diff --git a/cpp/pixels-retina/include/TileVisibility.h b/cpp/pixels-retina/include/TileVisibility.h index 51bf11351..af299dcc7 100644 --- a/cpp/pixels-retina/include/TileVisibility.h +++ b/cpp/pixels-retina/include/TileVisibility.h @@ -97,7 +97,7 @@ class TileVisibility : public pixels::RetinaBase> { ~TileVisibility() override; void deleteTileRecord(uint16_t rowId, uint64_t ts); void getTileVisibilityBitmap(uint64_t ts, uint64_t* outBitmap) const; - void collectTileGarbage(uint64_t ts); + void collectTileGarbage(uint64_t ts, uint64_t* gcSnapshotBitmap); private: TileVisibility(const TileVisibility &) = delete; diff --git a/cpp/pixels-retina/lib/RGVisibility.cpp b/cpp/pixels-retina/lib/RGVisibility.cpp index 47c210702..f6e33a175 100644 --- a/cpp/pixels-retina/lib/RGVisibility.cpp +++ b/cpp/pixels-retina/lib/RGVisibility.cpp @@ -62,11 +62,14 @@ RGVisibility::~RGVisibility() { } template -void RGVisibility::collectRGGarbage(uint64_t timestamp) { -// TileVisibility::collectTileGarbage uses COW + Epoch, so it's safe to call concurrently - for (uint64_t i = 0; i < tileCount; i++) { - tileVisibilities[i].collectTileGarbage(timestamp); +std::vector RGVisibility::collectRGGarbage(uint64_t timestamp) { + size_t totalWords = tileCount * BITMAP_SIZE_PER_TILE_VISIBILITY; + std::vector rgSnapshot(totalWords, 0); + for (uint32_t t = 0; t < tileCount; t++) { + tileVisibilities[t].collectTileGarbage(timestamp, + rgSnapshot.data() + t * BITMAP_SIZE_PER_TILE_VISIBILITY); } + return rgSnapshot; } template diff --git a/cpp/pixels-retina/lib/RGVisibilityJni.cpp b/cpp/pixels-retina/lib/RGVisibilityJni.cpp index ff655f8a7..88a3ce7ea 100644 --- a/cpp/pixels-retina/lib/RGVisibilityJni.cpp +++ b/cpp/pixels-retina/lib/RGVisibilityJni.cpp @@ -129,15 +129,20 @@ JNIEXPORT jlongArray JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_getVisi /* * Class: io_pixelsdb_pixels_retina_RGVisibility * Method: garbageCollect - * Signature: (JJ)V + * Signature: (JJ)[J */ -JNIEXPORT void JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_garbageCollect +JNIEXPORT jlongArray JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_garbageCollect (JNIEnv* env, jobject, jlong timestamp, jlong handle) { try { auto* rgVisibility = reinterpret_cast(handle); - rgVisibility->collectRGGarbage(timestamp); + std::vector snapshot = rgVisibility->collectRGGarbage(timestamp); + jlongArray result = env->NewLongArray(snapshot.size()); + env->SetLongArrayRegion(result, 0, snapshot.size(), + reinterpret_cast(snapshot.data())); + return result; } catch (const std::exception& e) { env->ThrowNew(env->FindClass("java/lang/RuntimeException"), e.what()); + return nullptr; } } diff --git a/cpp/pixels-retina/lib/TileVisibility.cpp b/cpp/pixels-retina/lib/TileVisibility.cpp index 3195d7aa3..4e02e6605 100644 --- a/cpp/pixels-retina/lib/TileVisibility.cpp +++ b/cpp/pixels-retina/lib/TileVisibility.cpp @@ -238,7 +238,7 @@ void TileVisibility::getTileVisibilityBitmap(uint64_t ts, uint64_t* ou } template -void TileVisibility::collectTileGarbage(uint64_t ts) { +void TileVisibility::collectTileGarbage(uint64_t ts, uint64_t* gcSnapshotBitmap) { // Drain the pending retirement slot left by deleteTileRecord's empty-chain path. VersionedData* pending = pendingRetire.exchange(nullptr, std::memory_order_acquire); if (pending) { @@ -248,7 +248,12 @@ void TileVisibility::collectTileGarbage(uint64_t ts) { // Load old version VersionedData* oldVer = currentVersion.load(std::memory_order_acquire); - if (ts <= oldVer->baseTimestamp) return; + + // Early return A: safeGcTs <= baseTimestamp, nothing to compact + if (ts <= oldVer->baseTimestamp) { + std::memcpy(gcSnapshotBitmap, oldVer->baseBitmap, NUM_WORDS * sizeof(uint64_t)); + return; + } // Find the last block that should be compacted DeleteIndexBlock *blk = oldVer->head; @@ -275,7 +280,22 @@ void TileVisibility::collectTileGarbage(uint64_t ts) { blk = blk->next.load(std::memory_order_acquire); } - if (!lastFullBlk) return; + // Early return B: no compactable block + if (!lastFullBlk) { + std::memcpy(gcSnapshotBitmap, oldVer->baseBitmap, NUM_WORDS * sizeof(uint64_t)); + if (oldVer->head) { + auto* tailSnap = tail.load(std::memory_order_acquire); + size_t tailUsedSnap = tailUsed.load(std::memory_order_acquire); + size_t cnt = (oldVer->head == tailSnap) ? tailUsedSnap : DeleteIndexBlock::BLOCK_CAPACITY; + for (size_t i = 0; i < cnt; i++) { + uint64_t item = oldVer->head->items[i]; + if (item == 0) break; + if (extractTimestamp(item) <= ts) SET_BITMAP_BIT(gcSnapshotBitmap, extractRowId(item)); + else break; + } + } + return; + } // Create new version with Copy-on-Write // Manually compute the new base bitmap from oldVer @@ -304,8 +324,22 @@ void TileVisibility::collectTileGarbage(uint64_t ts) { blk = blk->next.load(std::memory_order_acquire); } - // Get new head and break the chain to avoid double-free + // Compact path: build gcSnapshotBitmap by scanning the boundary block DeleteIndexBlock* newHead = lastFullBlk->next.load(std::memory_order_acquire); + std::memcpy(gcSnapshotBitmap, newBaseBitmap, NUM_WORDS * sizeof(uint64_t)); + if (newHead) { + auto* tailSnap = tail.load(std::memory_order_acquire); + size_t tailUsedSnap = tailUsed.load(std::memory_order_acquire); + size_t cnt = (newHead == tailSnap) ? tailUsedSnap : DeleteIndexBlock::BLOCK_CAPACITY; + for (size_t i = 0; i < cnt; i++) { + uint64_t item = newHead->items[i]; + if (item == 0) break; + if (extractTimestamp(item) <= ts) SET_BITMAP_BIT(gcSnapshotBitmap, extractRowId(item)); + else break; + } + } + + // Break the chain to avoid double-free lastFullBlk->next.store(nullptr, std::memory_order_release); // Create new version with new head - this is the atomic COW update diff --git a/cpp/pixels-retina/test/RGVisibilityTest.cpp b/cpp/pixels-retina/test/RGVisibilityTest.cpp index b1e6daf63..da69abc63 100644 --- a/cpp/pixels-retina/test/RGVisibilityTest.cpp +++ b/cpp/pixels-retina/test/RGVisibilityTest.cpp @@ -235,4 +235,284 @@ TEST_F(RGVisibilityTest, MultiThread) { delete[] finalBitmap; delete[] expectedFinalBitmap; +} + +// ===================================================================== +// gcSnapshotBitmap correctness tests +// +// Core verification: the bitmap returned by collectRGGarbage (gcSnapshotBitmap) +// must be bitwise identical to getRGVisibilityBitmap called BEFORE GC. +// +// Why pre-GC reference matters: +// getRGVisibilityBitmap traverses the full, unmodified deletion chain — it is +// a completely independent computation from the GC code path. Comparing +// gcSnapshotBitmap with a post-GC getRGVisibilityBitmap is weaker because +// both read from state that GC just modified; a bug that corrupts the compact +// AND the snapshot identically would go undetected. +// +// Each test also verifies that post-GC queries still return correct results +// (regression check on the compact logic itself). +// +// Covers all three code paths in collectTileGarbage: +// A — ts <= baseTimestamp (early return, no compaction) +// B — chain exists but no full block compactable +// C — one or more blocks compacted (with/without boundary block) +// ===================================================================== + +static void compareBitmaps( + const uint64_t* actual, const uint64_t* expected, uint64_t size, uint64_t ts, + const char* actualLabel, const char* expectedLabel) +{ + for (size_t i = 0; i < size; i++) { + EXPECT_EQ(actual[i], expected[i]) + << "Word " << i << " (rows " << (i * 64) << "-" << (i * 64 + 63) + << ") at ts=" << ts + << "\n " << actualLabel << ": " << std::bitset<64>(actual[i]) + << "\n " << expectedLabel << ": " << std::bitset<64>(expected[i]); + } +} + +static void verifyGcSnapshot( + RGVisibilityInstance* rgv, uint64_t ts, + const uint64_t* preGcRef, const std::vector& snapshot) +{ + uint64_t bitmapSize = rgv->getBitmapSize(); + ASSERT_EQ(snapshot.size(), bitmapSize); + + // Primary check: gcSnapshotBitmap must match the pre-GC ground truth + compareBitmaps(snapshot.data(), preGcRef, bitmapSize, ts, + "gcSnapshot", "preGcRef"); + + // Secondary check: post-GC query must also agree (compact regression) + uint64_t* postGcRef = rgv->getRGVisibilityBitmap(ts); + compareBitmaps(snapshot.data(), postGcRef, bitmapSize, ts, + "gcSnapshot", "postGcQuery"); + delete[] postGcRef; +} + +// Path A: empty chain → all-zero snapshot; then repeat GC at same ts → early return A +TEST_F(RGVisibilityTest, GcSnapshot_EarlyReturnA) { + // Empty chain: baseTimestamp=0, ts=100 > 0 → enters path B with null head + uint64_t* preRef0 = rgVisibility->getRGVisibilityBitmap(100); + std::vector snap0 = rgVisibility->collectRGGarbage(100); + verifyGcSnapshot(rgVisibility, 100, preRef0, snap0); + delete[] preRef0; + for (auto w : snap0) { + EXPECT_EQ(w, 0ULL); + } + + // Add deletes and compact to advance baseTimestamp + rgVisibility->deleteRGRecord(5, 100); + rgVisibility->deleteRGRecord(10, 100); + rgVisibility->deleteRGRecord(15, 200); + + // First GC at ts=200 → compact all 3 items → baseTimestamp becomes 200 + uint64_t* preRef1 = rgVisibility->getRGVisibilityBitmap(200); + std::vector snap1 = rgVisibility->collectRGGarbage(200); + verifyGcSnapshot(rgVisibility, 200, preRef1, snap1); + delete[] preRef1; + + // Second GC at ts=200 → ts == baseTimestamp → true early return A + uint64_t* preRef2 = rgVisibility->getRGVisibilityBitmap(200); + std::vector snap2 = rgVisibility->collectRGGarbage(200); + verifyGcSnapshot(rgVisibility, 200, preRef2, snap2); + delete[] preRef2; + + ASSERT_EQ(snap1.size(), snap2.size()); + for (size_t i = 0; i < snap1.size(); i++) { + EXPECT_EQ(snap1[i], snap2[i]); + } +} + +// Path B: chain exists, head block straddles safeGcTs → no compactable block +TEST_F(RGVisibilityTest, GcSnapshot_EarlyReturnB) { + // 5 items in one block: ts 1,2,3,8,10. Block last ts=10 > safeGcTs=5 + rgVisibility->deleteRGRecord(0, 1); + rgVisibility->deleteRGRecord(1, 2); + rgVisibility->deleteRGRecord(2, 3); + rgVisibility->deleteRGRecord(3, 8); + rgVisibility->deleteRGRecord(4, 10); + + uint64_t* preRef = rgVisibility->getRGVisibilityBitmap(5); + std::vector snapshot = rgVisibility->collectRGGarbage(5); + verifyGcSnapshot(rgVisibility, 5, preRef, snapshot); + delete[] preRef; + + // Rows 0,1,2 marked (ts ≤ 5); rows 3,4 not (ts 8,10 > 5) + EXPECT_EQ(snapshot[0], 0b111ULL); +} + +// Path B variant: all items in head block have ts > safeGcTs +TEST_F(RGVisibilityTest, GcSnapshot_EarlyReturnB_NoneMatch) { + rgVisibility->deleteRGRecord(0, 10); + rgVisibility->deleteRGRecord(1, 20); + + uint64_t* preRef = rgVisibility->getRGVisibilityBitmap(5); + std::vector snapshot = rgVisibility->collectRGGarbage(5); + verifyGcSnapshot(rgVisibility, 5, preRef, snapshot); + delete[] preRef; + + EXPECT_EQ(snapshot[0], 0ULL); +} + +// Path C: one full block compacted + boundary block with mixed items +TEST_F(RGVisibilityTest, GcSnapshot_CompactWithBoundary) { + // 10 items: rows 0-9, ts 1-10 + // Block 1 (8 items, ts 1-8): last ts=8 ≤ 9 → compactable + // Block 2 (2 items, ts 9-10): boundary block + for (uint32_t i = 0; i < 10; i++) { + rgVisibility->deleteRGRecord(i, i + 1); + } + + uint64_t* preRef = rgVisibility->getRGVisibilityBitmap(9); + std::vector snapshot = rgVisibility->collectRGGarbage(9); + verifyGcSnapshot(rgVisibility, 9, preRef, snapshot); + delete[] preRef; + + // Rows 0-8 marked (ts 1-9 ≤ 9), row 9 not (ts 10 > 9) + EXPECT_EQ(snapshot[0], 0x1FFULL); // bits 0-8 +} + +// Path C: all blocks fully compacted, no remaining chain +TEST_F(RGVisibilityTest, GcSnapshot_CompactAllBlocks) { + // Exactly 8 items fill one block: rows 0-7, ts 1-8 + for (uint32_t i = 0; i < 8; i++) { + rgVisibility->deleteRGRecord(i, i + 1); + } + + // safeGcTs=10 > all item ts → entire block compacted, newHead=null + uint64_t* preRef = rgVisibility->getRGVisibilityBitmap(10); + std::vector snapshot = rgVisibility->collectRGGarbage(10); + verifyGcSnapshot(rgVisibility, 10, preRef, snapshot); + delete[] preRef; + + EXPECT_EQ(snapshot[0], 0xFFULL); // bits 0-7 +} + +// Path C: multiple blocks compacted before a boundary block +TEST_F(RGVisibilityTest, GcSnapshot_CompactMultiBlock) { + // 20 items: rows 0-19, ts 1-20 + // Block 1 (ts 1-8), Block 2 (ts 9-16), Block 3 tail (ts 17-20) + // safeGcTs=18: blocks 1,2 compacted, block 3 is boundary + for (uint32_t i = 0; i < 20; i++) { + rgVisibility->deleteRGRecord(i, i + 1); + } + + uint64_t* preRef = rgVisibility->getRGVisibilityBitmap(18); + std::vector snapshot = rgVisibility->collectRGGarbage(18); + verifyGcSnapshot(rgVisibility, 18, preRef, snapshot); + delete[] preRef; + + // Rows 0-17 marked (ts 1-18 ≤ 18), rows 18-19 not + EXPECT_EQ(snapshot[0], (1ULL << 18) - 1); +} + +// Multiple deletes sharing the same timestamp (batch deletes) +TEST_F(RGVisibilityTest, GcSnapshot_SameTimestamp) { + rgVisibility->deleteRGRecord(0, 5); + rgVisibility->deleteRGRecord(1, 5); + rgVisibility->deleteRGRecord(2, 5); + rgVisibility->deleteRGRecord(3, 10); + rgVisibility->deleteRGRecord(4, 10); + + uint64_t* preRef = rgVisibility->getRGVisibilityBitmap(5); + std::vector snapshot = rgVisibility->collectRGGarbage(5); + verifyGcSnapshot(rgVisibility, 5, preRef, snapshot); + delete[] preRef; + + EXPECT_EQ(snapshot[0], 0b111ULL); +} + +// Deletes spanning multiple tiles (RETINA_CAPACITY=256 rows per tile) +TEST_F(RGVisibilityTest, GcSnapshot_CrossTile) { + // Tile 0: rows 0-255 Tile 1: rows 256-511 + // Tile 2: rows 512-767 + rgVisibility->deleteRGRecord(5, 1); // tile 0 + rgVisibility->deleteRGRecord(10, 2); // tile 0 + rgVisibility->deleteRGRecord(260, 3); // tile 1, localRow 4 + rgVisibility->deleteRGRecord(600, 4); // tile 2, localRow 88 + rgVisibility->deleteRGRecord(100, 5); // tile 0 + rgVisibility->deleteRGRecord(300, 6); // tile 1, localRow 44 + + uint64_t* preRef1 = rgVisibility->getRGVisibilityBitmap(4); + std::vector snapshot = rgVisibility->collectRGGarbage(4); + verifyGcSnapshot(rgVisibility, 4, preRef1, snapshot); + delete[] preRef1; + + // After GC at ts=4, also verify a higher ts sees more deletes + uint64_t* preRef2 = rgVisibility->getRGVisibilityBitmap(6); + std::vector snap2 = rgVisibility->collectRGGarbage(6); + verifyGcSnapshot(rgVisibility, 6, preRef2, snap2); + delete[] preRef2; +} + +// Progressive GC rounds with interleaved inserts +TEST_F(RGVisibilityTest, GcSnapshot_ProgressiveRounds) { + // Phase 1: 20 deletes at ts 1-20 + for (uint32_t i = 0; i < 20; i++) { + rgVisibility->deleteRGRecord(i, i + 1); + } + + uint64_t* preRef1 = rgVisibility->getRGVisibilityBitmap(5); + std::vector snap1 = rgVisibility->collectRGGarbage(5); + verifyGcSnapshot(rgVisibility, 5, preRef1, snap1); + delete[] preRef1; + + uint64_t* preRef2 = rgVisibility->getRGVisibilityBitmap(12); + std::vector snap2 = rgVisibility->collectRGGarbage(12); + verifyGcSnapshot(rgVisibility, 12, preRef2, snap2); + delete[] preRef2; + + // Phase 2: 10 more deletes at ts 21-30 + for (uint32_t i = 20; i < 30; i++) { + rgVisibility->deleteRGRecord(i, i + 1); + } + + uint64_t* preRef3 = rgVisibility->getRGVisibilityBitmap(25); + std::vector snap3 = rgVisibility->collectRGGarbage(25); + verifyGcSnapshot(rgVisibility, 25, preRef3, snap3); + delete[] preRef3; + + // Final GC beyond all timestamps + uint64_t* preRef4 = rgVisibility->getRGVisibilityBitmap(100); + std::vector snap4 = rgVisibility->collectRGGarbage(100); + verifyGcSnapshot(rgVisibility, 100, preRef4, snap4); + delete[] preRef4; + + // All 30 rows should be marked + EXPECT_EQ(snap4[0], (1ULL << 30) - 1); +} + +// Randomized: random deletes across all tiles, verify at each GC round +TEST_F(RGVisibilityTest, GcSnapshot_Randomized) { + std::mt19937 gen(42); + std::uniform_int_distribution rowDist(0, ROW_COUNT - 1); + std::vector deleted(ROW_COUNT, false); + uint64_t ts = 1; + uint64_t lastGcTs = 0; + + for (int round = 0; round < 10; round++) { + for (int d = 0; d < 100; d++) { + uint32_t rowId; + do { rowId = rowDist(gen); } while (deleted[rowId]); + deleted[rowId] = true; + rgVisibility->deleteRGRecord(rowId, ts); + ts++; + } + + uint64_t gcTs = lastGcTs + 51; + if (gcTs >= ts) gcTs = ts - 1; + + uint64_t* preRef = rgVisibility->getRGVisibilityBitmap(gcTs); + std::vector snapshot = rgVisibility->collectRGGarbage(gcTs); + verifyGcSnapshot(rgVisibility, gcTs, preRef, snapshot); + delete[] preRef; + lastGcTs = gcTs; + } + + // Final GC beyond all timestamps + uint64_t* preRefFinal = rgVisibility->getRGVisibilityBitmap(ts + 100); + std::vector finalSnap = rgVisibility->collectRGGarbage(ts + 100); + verifyGcSnapshot(rgVisibility, ts + 100, preRefFinal, finalSnap); + delete[] preRefFinal; } \ No newline at end of file diff --git a/cpp/pixels-retina/test/TileVisibilityTest.cpp b/cpp/pixels-retina/test/TileVisibilityTest.cpp index 60fdcc802..122baf588 100644 --- a/cpp/pixels-retina/test/TileVisibilityTest.cpp +++ b/cpp/pixels-retina/test/TileVisibilityTest.cpp @@ -68,6 +68,11 @@ class TileVisibilityTest : public ::testing::Test { return true; } + void collectGarbage(uint64_t ts) { + uint64_t buf[BITMAP_SIZE] = {0}; + v->collectTileGarbage(ts, buf); + } + TileVisibility* v; }; @@ -89,7 +94,7 @@ TEST_F(TileVisibilityTest, BaseFunction) { SET_BITMAP_BIT(expectedBitmap, 2); EXPECT_TRUE(checkBitmap(actualBitmap, expectedBitmap)); - v->collectTileGarbage(101); + collectGarbage(101); v->getTileVisibilityBitmap(101, actualBitmap); EXPECT_TRUE(checkBitmap(actualBitmap, expectedBitmap)); } @@ -111,7 +116,7 @@ TEST_F(TileVisibilityTest, GarbageCollect) { for (int i = 0; i < count; i++) { v->deleteTileRecord(i, i + 100); } - v->collectTileGarbage(150); + collectGarbage(150); uint64_t actualBitmap[BITMAP_SIZE] = {0}; uint64_t expectedBitmap[BITMAP_SIZE] = {0}; @@ -123,7 +128,7 @@ TEST_F(TileVisibilityTest, GarbageCollect) { for (int i = 51; i < count; i++) { SET_BITMAP_BIT(expectedBitmap, i); } - v->collectTileGarbage(100 + count); + collectGarbage(100 + count); v->getTileVisibilityBitmap(100 + count, actualBitmap); EXPECT_TRUE(checkBitmap(actualBitmap, expectedBitmap)); } @@ -300,7 +305,7 @@ TEST_F(TileVisibilityTest, ZeroSentinelInGarbageCollect) { // during the Scenario-2 race if tailUsed were stale at BLOCK_CAPACITY. v->deleteTileRecord(0, 0); - v->collectTileGarbage(100); + collectGarbage(100); uint64_t actualBitmap[BITMAP_SIZE] = {0}; v->getTileVisibilityBitmap(100, actualBitmap); @@ -355,7 +360,7 @@ TEST_F(TileVisibilityTest, ConcurrentGCAndFirstInsert) { // then immediately fires GC to maximise the chance of hitting the race window. auto gcThread = std::thread([&]() { while (!deleteStarted.load(std::memory_order_acquire)) {} - v->collectTileGarbage(1000); + collectGarbage(1000); gcDone.store(true, std::memory_order_release); }); @@ -369,7 +374,7 @@ TEST_F(TileVisibilityTest, ConcurrentGCAndFirstInsert) { // After both operations complete, GC with a ts that covers the inserted item // and verify the bitmap is exactly {row 5 deleted}. - v->collectTileGarbage(1000); + collectGarbage(1000); uint64_t actualBitmap[BITMAP_SIZE] = {0}; v->getTileVisibilityBitmap(1000, actualBitmap); @@ -429,7 +434,7 @@ TEST_F(TileVisibilityTest, ConcurrentGCAndBlockTransition) { // then fires GC immediately to race with tail/tailUsed update. auto gcThread = std::thread([&]() { while (!insertReady.load(std::memory_order_acquire)) {} - v->collectTileGarbage(GC_TS); + collectGarbage(GC_TS); }); // Insert thread: signal then insert the (BLOCK_CAPACITY+1)-th item to force @@ -440,7 +445,7 @@ TEST_F(TileVisibilityTest, ConcurrentGCAndBlockTransition) { gcThread.join(); // Run one more clean GC to ensure everything that should be compacted is. - v->collectTileGarbage(GC_TS); + collectGarbage(GC_TS); // Check the canary: bit 0 must be 0 because row 0 was never deleted. uint64_t bitmap[BITMAP_SIZE] = {0}; diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RGVisibility.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RGVisibility.java index a986717f9..a2abad86c 100644 --- a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RGVisibility.java +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RGVisibility.java @@ -104,7 +104,7 @@ public void close() private native void destroyNativeObject(long nativeHandle); private native void deleteRecord(int rgRowOffset, long timestamp, long nativeHandle); private native long[] getVisibilityBitmap(long timestamp, long nativeHandle); - private native void garbageCollect(long timestamp, long nativeHandle); + private native long[] garbageCollect(long timestamp, long nativeHandle); private static native long getNativeMemoryUsage(); private static native long getRetinaTrackedMemoryUsage(); private static native long getRetinaObjectCount(); @@ -132,7 +132,7 @@ public long[] getVisibilityBitmap(long timestamp) return bitmap; } - public void garbageCollect(long timestamp) + public long[] garbageCollect(long timestamp) { long handle = this.nativeHandle.get(); if (handle == 0) @@ -140,7 +140,7 @@ public void garbageCollect(long timestamp) throw new IllegalStateException("RGVisibility instance has been closed."); } - garbageCollect(timestamp, handle); + return garbageCollect(timestamp, handle); } /** diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java index e2e37bbc2..f748bf140 100644 --- a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java @@ -281,6 +281,12 @@ public void unregisterOffload(long timestamp) } private CompletableFuture createCheckpoint(long timestamp, CheckpointType type) throws RetinaException + { + return createCheckpoint(timestamp, type, null); + } + + private CompletableFuture createCheckpoint( + long timestamp, CheckpointType type, Map precomputedBitmaps) throws RetinaException { String prefix = (type == CheckpointType.GC) ? RetinaUtils.CHECKPOINT_PREFIX_GC : RetinaUtils.CHECKPOINT_PREFIX_OFFLOAD; String fileName = RetinaUtils.getCheckpointFileName(prefix, retinaHostName, timestamp); @@ -292,10 +298,9 @@ private CompletableFuture createCheckpoint(long timestamp, CheckpointType logger.info("Starting {} checkpoint for {} RGs at timestamp {}", type, totalRgs, timestamp); // 2. Use a BlockingQueue for producer-consumer pattern - // Limit capacity to avoid excessive memory usage if writing is slow BlockingQueue queue = new LinkedBlockingQueue<>(1024); - // 3. Start producer tasks to fetch bitmaps in parallel + // 3. Start producer tasks to fetch bitmaps for (Map.Entry entry : entries) { checkpointExecutor.submit(() -> { @@ -306,7 +311,14 @@ private CompletableFuture createCheckpoint(long timestamp, CheckpointType long fileId = Long.parseLong(parts[0]); int rgId = Integer.parseInt(parts[1]); RGVisibility rgVisibility = entry.getValue(); - long[] bitmap = rgVisibility.getVisibilityBitmap(timestamp); + long[] bitmap; + if (precomputedBitmaps != null && precomputedBitmaps.containsKey(key)) + { + bitmap = precomputedBitmaps.get(key); + } else + { + bitmap = rgVisibility.getVisibilityBitmap(timestamp); + } queue.put(new CheckpointFileIO.CheckpointEntry(fileId, rgId, (int) rgVisibility.getRecordNum(), bitmap)); } catch (Exception e) { @@ -628,14 +640,17 @@ private void runGC() try { // Step 1: Memory GC — compact Deletion Chain into baseBitmap. + // Collect gcSnapshotBitmaps produced as a side-effect of compaction. + Map gcSnapshotBitmaps = new HashMap<>(); for (Map.Entry entry : this.rgVisibilityMap.entrySet()) { - entry.getValue().garbageCollect(timestamp); + long[] bitmap = entry.getValue().garbageCollect(timestamp); + gcSnapshotBitmaps.put(entry.getKey(), bitmap); } - // Step 2: Persist the post-Memory-GC visibility state. Block until the - // checkpoint file is fully written (see Javadoc above for why we join here). - createCheckpoint(timestamp, CheckpointType.GC).join(); + // Step 2: Persist the post-Memory-GC visibility state using precomputed + // bitmaps (zero chain traversal). Block until fully written. + createCheckpoint(timestamp, CheckpointType.GC, gcSnapshotBitmaps).join(); // Step 3: Storage GC — rewrite high-deletion-ratio files (TODO). // if (storageGcEnabled) storageGarbageCollector.runStorageGC(); diff --git a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRGVisibility.java b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRGVisibility.java index d21384b8e..0bea890fc 100644 --- a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRGVisibility.java +++ b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRGVisibility.java @@ -57,7 +57,16 @@ public void testRGVisibilityInitialized() long timestamp1 = 100; long timestamp2 = 200; - long[] bitmap = {1, 0, 0, 0}; + // Probe the native library to determine per-tile bitmap size, + // which depends on RETINA_CAPACITY set at compile time. + int bitmapWords; + try (RGVisibility probe = new RGVisibility(1)) + { + bitmapWords = probe.getVisibilityBitmap(0).length; + } + + long[] bitmap = new long[bitmapWords]; + bitmap[0] = 1; RGVisibility rgVisibilityInitialized = new RGVisibility(256, 0, bitmap); rgVisibilityInitialized.deleteRecord(5, timestamp1); @@ -69,6 +78,8 @@ public void testRGVisibilityInitialized() long[] bitmap2 = rgVisibilityInitialized.getVisibilityBitmap(timestamp2); assertEquals(0b1000010000100001L, bitmap2[0]); + + rgVisibilityInitialized.close(); } @Test @@ -388,4 +399,219 @@ class DeleteRecord verifyBitmap.accept(maxTimestamp.get(), finalBitmap); } + + // ===================================================================== + // gcSnapshotBitmap JNI round-trip tests + // + // Verify that garbageCollect() (now returning long[]) produces a bitmap + // identical to getVisibilityBitmap() called BEFORE GC — the independent + // ground truth computed from the full, unmodified deletion chain. + // ===================================================================== + + private static void assertBitmapsEqual(String msg, long[] expected, long[] actual) + { + assertEquals(msg + ": length mismatch", expected.length, actual.length); + for (int i = 0; i < expected.length; i++) + { + if (expected[i] != actual[i]) + { + fail(String.format("%s: word %d (rows %d-%d) mismatch%n expected: %s%n actual: %s", + msg, i, i * 64, i * 64 + 63, + String.format("%64s", Long.toBinaryString(expected[i])).replace(' ', '0'), + String.format("%64s", Long.toBinaryString(actual[i])).replace(' ', '0'))); + } + } + } + + @Test + public void testGcSnapshotEarlyReturnA() + { + // Empty chain → all-zero snapshot + long[] preRef0 = rgVisibility.getVisibilityBitmap(100); + long[] snap0 = rgVisibility.garbageCollect(100); + assertBitmapsEqual("empty chain", preRef0, snap0); + for (long w : snap0) + { + assertEquals(0L, w); + } + + // Add deletes and compact to advance baseTimestamp + rgVisibility.deleteRecord(5, 100); + rgVisibility.deleteRecord(10, 100); + rgVisibility.deleteRecord(15, 200); + + // First GC at ts=200 → compact all items + long[] preRef1 = rgVisibility.getVisibilityBitmap(200); + long[] snap1 = rgVisibility.garbageCollect(200); + assertBitmapsEqual("first compact", preRef1, snap1); + + // Second GC at ts=200 → early return A (ts == baseTimestamp) + long[] preRef2 = rgVisibility.getVisibilityBitmap(200); + long[] snap2 = rgVisibility.garbageCollect(200); + assertBitmapsEqual("repeat GC", preRef2, snap2); + + // Both snapshots must be identical + assertBitmapsEqual("snap1 vs snap2", snap1, snap2); + } + + @Test + public void testGcSnapshotEarlyReturnB() + { + // 5 items in one block: ts 1,2,3,8,10. Block last ts=10 > safeGcTs=5 + rgVisibility.deleteRecord(0, 1); + rgVisibility.deleteRecord(1, 2); + rgVisibility.deleteRecord(2, 3); + rgVisibility.deleteRecord(3, 8); + rgVisibility.deleteRecord(4, 10); + + long[] preRef = rgVisibility.getVisibilityBitmap(5); + long[] snapshot = rgVisibility.garbageCollect(5); + assertBitmapsEqual("early return B", preRef, snapshot); + + // Rows 0,1,2 marked (ts ≤ 5); rows 3,4 not + assertEquals(0b111L, snapshot[0]); + } + + @Test + public void testGcSnapshotCompactWithBoundary() + { + // 10 items: rows 0-9, ts 1-10 + // Block 1 (8 items, ts 1-8): compactable at safeGcTs=9 + // Block 2 (2 items, ts 9-10): boundary block (tail) + for (int i = 0; i < 10; i++) + { + rgVisibility.deleteRecord(i, i + 1); + } + + long[] preRef = rgVisibility.getVisibilityBitmap(9); + long[] snapshot = rgVisibility.garbageCollect(9); + assertBitmapsEqual("compact with boundary", preRef, snapshot); + + // Rows 0-8 marked, row 9 not + assertEquals(0x1FFL, snapshot[0]); + } + + @Test + public void testGcSnapshotCompactAllBlocks() + { + // 8 items fill one block: rows 0-7, ts 1-8 + for (int i = 0; i < 8; i++) + { + rgVisibility.deleteRecord(i, i + 1); + } + + // safeGcTs=10 > all item ts → entire block compacted + long[] preRef = rgVisibility.getVisibilityBitmap(10); + long[] snapshot = rgVisibility.garbageCollect(10); + assertBitmapsEqual("compact all blocks", preRef, snapshot); + + assertEquals(0xFFL, snapshot[0]); + } + + @Test + public void testGcSnapshotCompactMultiBlock() + { + // 20 items: rows 0-19, ts 1-20 + // Block 1 (ts 1-8), Block 2 (ts 9-16), Block 3 tail (ts 17-20) + for (int i = 0; i < 20; i++) + { + rgVisibility.deleteRecord(i, i + 1); + } + + // safeGcTs=18: blocks 1,2 compacted, block 3 is boundary + long[] preRef = rgVisibility.getVisibilityBitmap(18); + long[] snapshot = rgVisibility.garbageCollect(18); + assertBitmapsEqual("compact multi-block", preRef, snapshot); + + // Rows 0-17 marked + assertEquals((1L << 18) - 1, snapshot[0]); + } + + @Test + public void testGcSnapshotCrossTile() + { + // Tile 0: rows 0-255 Tile 1: rows 256-511 Tile 2: rows 512-767 + rgVisibility.deleteRecord(5, 1); + rgVisibility.deleteRecord(10, 2); + rgVisibility.deleteRecord(260, 3); // tile 1 + rgVisibility.deleteRecord(600, 4); // tile 2 + rgVisibility.deleteRecord(100, 5); // tile 0 + rgVisibility.deleteRecord(300, 6); // tile 1 + + long[] preRef1 = rgVisibility.getVisibilityBitmap(4); + long[] snap1 = rgVisibility.garbageCollect(4); + assertBitmapsEqual("cross-tile ts=4", preRef1, snap1); + + long[] preRef2 = rgVisibility.getVisibilityBitmap(6); + long[] snap2 = rgVisibility.garbageCollect(6); + assertBitmapsEqual("cross-tile ts=6", preRef2, snap2); + } + + @Test + public void testGcSnapshotProgressiveRounds() + { + // Phase 1: 20 deletes at ts 1-20 + for (int i = 0; i < 20; i++) + { + rgVisibility.deleteRecord(i, i + 1); + } + + long[] preRef1 = rgVisibility.getVisibilityBitmap(5); + long[] snap1 = rgVisibility.garbageCollect(5); + assertBitmapsEqual("round 1", preRef1, snap1); + + long[] preRef2 = rgVisibility.getVisibilityBitmap(12); + long[] snap2 = rgVisibility.garbageCollect(12); + assertBitmapsEqual("round 2", preRef2, snap2); + + // Phase 2: 10 more deletes at ts 21-30 + for (int i = 20; i < 30; i++) + { + rgVisibility.deleteRecord(i, i + 1); + } + + long[] preRef3 = rgVisibility.getVisibilityBitmap(25); + long[] snap3 = rgVisibility.garbageCollect(25); + assertBitmapsEqual("round 3", preRef3, snap3); + + long[] preRef4 = rgVisibility.getVisibilityBitmap(100); + long[] snap4 = rgVisibility.garbageCollect(100); + assertBitmapsEqual("round 4", preRef4, snap4); + + // All 30 rows marked + assertEquals((1L << 30) - 1, snap4[0]); + } + + @Test + public void testGcSnapshotRandomized() + { + Random rng = new Random(42); + boolean[] deleted = new boolean[ROW_COUNT]; + long ts = 1; + long lastGcTs = 0; + + for (int round = 0; round < 10; round++) + { + for (int d = 0; d < 100; d++) + { + int rowId; + do { rowId = rng.nextInt(ROW_COUNT); } while (deleted[rowId]); + deleted[rowId] = true; + rgVisibility.deleteRecord(rowId, ts); + ts++; + } + + long gcTs = lastGcTs + 51; + if (gcTs >= ts) gcTs = ts - 1; + + long[] preRef = rgVisibility.getVisibilityBitmap(gcTs); + long[] snapshot = rgVisibility.garbageCollect(gcTs); + assertBitmapsEqual("randomized round " + round, preRef, snapshot); + lastGcTs = gcTs; + } + + long[] preRefFinal = rgVisibility.getVisibilityBitmap(ts + 100); + long[] finalSnap = rgVisibility.garbageCollect(ts + 100); + assertBitmapsEqual("randomized final", preRefFinal, finalSnap); + } } \ No newline at end of file From 9b910d453f90019ed7ab84b45dac00ae9f9ea7f2 Mon Sep 17 00:00:00 2001 From: Dongyang Geng Date: Sun, 29 Mar 2026 16:44:28 +0800 Subject: [PATCH 6/7] feat: implement StorageGC framework with file scanning and grouping --- .../src/main/resources/pixels.properties | 12 +- .../reader/TestVisibilityCheckpointCache.java | 2 +- .../pixels/retina/RetinaResourceManager.java | 73 ++- .../retina/StorageGarbageCollector.java | 399 +++++++++++++ .../pixels/retina/TestRetinaCheckpoint.java | 173 +++++- .../retina/TestStorageGarbageCollector.java | 548 ++++++++++++++++++ 6 files changed, 1190 insertions(+), 17 deletions(-) create mode 100644 pixels-retina/src/main/java/io/pixelsdb/pixels/retina/StorageGarbageCollector.java create mode 100644 pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestStorageGarbageCollector.java diff --git a/pixels-common/src/main/resources/pixels.properties b/pixels-common/src/main/resources/pixels.properties index cc9bd3122..1d4aa2b66 100644 --- a/pixels-common/src/main/resources/pixels.properties +++ b/pixels-common/src/main/resources/pixels.properties @@ -265,7 +265,7 @@ cpuspl = [10000,60000,300000,600000] # split mem (G) memspl = [1,8,16,32,64] -### pixels-retina - write buffer flush configuration ### +### pixels-retina ### # set to true to enable pixels-retina retina.enable=false @@ -304,7 +304,15 @@ pixels.transaction.offload.threshold=1800 # lease duration for retina offload cache in seconds, default 600s retina.offload.cache.lease.duration=600 # snapshot storage directory -pixels.retina.checkpoint.dir=file:///tmp/pixels-checkpoints +retina.checkpoint.dir=file:///tmp/pixels-checkpoints +# set to true to enable storage GC (rewrites high-deletion-ratio files to reclaim space) +retina.storage.gc.enabled=false +# invalidRatio must be strictly greater than this value for a file to be a GC candidate +retina.storage.gc.threshold=0.5 +# target size in bytes for rewritten files produced by Storage GC, default 128MB +retina.storage.gc.target.file.size=134217728 +# maximum number of (tableId, virtualNodeId) file groups processed per GC cycle +retina.storage.gc.max.file.groups.per.run=10 ### pixels-sink ### sink.server.enabled=false diff --git a/pixels-core/src/test/java/io/pixelsdb/pixels/core/reader/TestVisibilityCheckpointCache.java b/pixels-core/src/test/java/io/pixelsdb/pixels/core/reader/TestVisibilityCheckpointCache.java index ed1bcbe97..e4ca0e304 100644 --- a/pixels-core/src/test/java/io/pixelsdb/pixels/core/reader/TestVisibilityCheckpointCache.java +++ b/pixels-core/src/test/java/io/pixelsdb/pixels/core/reader/TestVisibilityCheckpointCache.java @@ -46,7 +46,7 @@ public class TestVisibilityCheckpointCache @Before public void setUp() throws IOException { - testCheckpointDir = ConfigFactory.Instance().getProperty("pixels.retina.checkpoint.dir"); + testCheckpointDir = ConfigFactory.Instance().getProperty("retina.checkpoint.dir"); storage = StorageFactory.Instance().getStorage(testCheckpointDir); if (!storage.exists(testCheckpointDir)) diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java index f748bf140..33854b290 100644 --- a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java @@ -63,6 +63,8 @@ public class RetinaResourceManager // GC related fields private final ScheduledExecutorService gcExecutor; + private final boolean storageGcEnabled; + private final StorageGarbageCollector storageGarbageCollector; // Checkpoint related fields private final ExecutorService checkpointExecutor; @@ -91,7 +93,7 @@ private RetinaResourceManager() ConfigFactory config = ConfigFactory.Instance(); this.checkpointRefCounts = new ConcurrentHashMap<>(); - this.checkpointDir = config.getProperty("pixels.retina.checkpoint.dir"); + this.checkpointDir = config.getProperty("retina.checkpoint.dir"); int cpThreads = Integer.parseInt(config.getProperty("retina.checkpoint.threads")); this.checkpointExecutor = Executors.newFixedThreadPool(cpThreads, r -> { @@ -124,6 +126,31 @@ private RetinaResourceManager() this.gcExecutor = executor; totalVirtualNodeNum = Integer.parseInt(ConfigFactory.Instance().getProperty("node.virtual.num")); this.retinaHostName = NetUtils.getLocalHostName(); + + boolean gcEnabled = false; + StorageGarbageCollector gc = null; + try + { + gcEnabled = Boolean.parseBoolean(config.getProperty("retina.storage.gc.enabled")); + if (gcEnabled) + { + double threshold = Double.parseDouble(config.getProperty("retina.storage.gc.threshold")); + long targetFileSize = Long.parseLong(config.getProperty("retina.storage.gc.target.file.size")); + int maxGroups = Integer.parseInt(config.getProperty("retina.storage.gc.max.file.groups.per.run")); + gc = new StorageGarbageCollector(this, this.metadataService, + threshold, targetFileSize, maxGroups); + logger.info("Storage GC enabled (threshold={}, targetFileSize={}, maxGroups={})", + threshold, targetFileSize, maxGroups); + } + } + catch (Exception e) + { + logger.error("Failed to initialise StorageGarbageCollector, Storage GC will be disabled", e); + gcEnabled = false; + gc = null; + } + this.storageGcEnabled = gcEnabled; + this.storageGarbageCollector = gc; } private static final class InstanceHolder @@ -595,7 +622,7 @@ private PixelsWriteBuffer checkPixelsWriteBuffer(String schema, String table, in } /** - * Run a full GC cycle: Memory GC → checkpoint → Storage GC (future). + * Run a full GC cycle: Memory GC → checkpoint → Storage GC (S1 scan + future S2-S6). * *

    Ordering rationale: *

      @@ -605,16 +632,18 @@ private PixelsWriteBuffer checkPixelsWriteBuffer(String schema, String table, in * {@code getVisibilityBitmap(lwm)} call traverses at most one partial block * (≤ {@code BLOCK_CAPACITY} items) instead of the entire pre-GC chain. This makes * checkpoint bitmap serialisation significantly cheaper. - *
    1. Checkpoint second, blocking: we call {@code .join()} so that - * {@code runGC()} does not return until the checkpoint file is fully written to disk. - * {@code gcExecutor} is a single-threaded scheduler; because {@code runGC()} must - * complete before the next invocation begins, this blocking join is the simplest way - * to guarantee that no two GC cycles ever overlap — no additional lock is required.
    2. + *
    3. Checkpoint second, unconditional and blocking: written regardless of whether + * Storage GC finds any candidate files. The {@code .join()} ensures the checkpoint + * file is fully on disk before Storage GC begins rewriting any files, so crash + * recovery can always restore the post-Memory-GC visibility state independently of + * any in-progress Storage GC rewrite. {@code gcExecutor} is single-threaded, so the + * blocking join is also the simplest way to guarantee no two GC cycles overlap.
    4. *
    5. Storage GC third: requires an up-to-date {@code baseBitmap} (hence after * Memory GC) and its own WAL for crash recovery. Placing it after the checkpoint * keeps the two recovery paths independent: on restart, the GC checkpoint restores * the post-Memory-GC visibility state, and the GcWal resumes any in-progress Storage - * GC task separately.
    6. + * GC task separately. Once scan completes, bitmaps for non-candidate files are + * immediately released from memory (they are no longer needed by S2-S6). *
    7. Advance {@code latestGcTimestamp} last: updated only after the entire cycle * succeeds (Memory GC + checkpoint + Storage GC). If any step throws, the timestamp * is not advanced and the next scheduled invocation will retry the full cycle.
    8. @@ -640,20 +669,38 @@ private void runGC() try { // Step 1: Memory GC — compact Deletion Chain into baseBitmap. - // Collect gcSnapshotBitmaps produced as a side-effect of compaction. + // In the same pass, pre-compute per-RG stats (recordNum, invalidCount) that + // Storage GC needs for S1 candidate selection, avoiding a second traversal. Map gcSnapshotBitmaps = new HashMap<>(); + Map rgStats = new HashMap<>(); // rgKey → {recordNum, invalidCount} for (Map.Entry entry : this.rgVisibilityMap.entrySet()) { long[] bitmap = entry.getValue().garbageCollect(timestamp); gcSnapshotBitmaps.put(entry.getKey(), bitmap); + + long recordNum = entry.getValue().getRecordNum(); + long invalidCount = 0; + for (long word : bitmap) invalidCount += Long.bitCount(word); + rgStats.put(entry.getKey(), new long[]{recordNum, invalidCount}); } - // Step 2: Persist the post-Memory-GC visibility state using precomputed - // bitmaps (zero chain traversal). Block until fully written. + // Step 2: Checkpoint — always written unconditionally with the full visibility + // snapshot before any Storage GC rewriting begins. createCheckpoint(timestamp, CheckpointType.GC, gcSnapshotBitmaps).join(); - // Step 3: Storage GC — rewrite high-deletion-ratio files (TODO). - // if (storageGcEnabled) storageGarbageCollector.runStorageGC(); + // Step 3: Storage GC — scan for high-deletion-ratio files; on finding candidates, + // trim non-candidate bitmaps from memory and log (S2-S6 added in Tasks 3-6). + if (storageGcEnabled && storageGarbageCollector != null) + { + try + { + storageGarbageCollector.runStorageGC(timestamp, rgStats, gcSnapshotBitmaps); + } + catch (Exception e) + { + logger.error("Storage GC failed", e); + } + } // Step 4: Advance the timestamp only after the full cycle succeeds. // latestGcTimestamp is no longer updated inside createCheckpoint's async diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/StorageGarbageCollector.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/StorageGarbageCollector.java new file mode 100644 index 000000000..33cdb87b9 --- /dev/null +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/StorageGarbageCollector.java @@ -0,0 +1,399 @@ +/* + * Copyright 2026 PixelsDB. + * + * This file is part of Pixels. + * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Affero GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. If not, see + * . + */ +package io.pixelsdb.pixels.retina; + +import io.pixelsdb.pixels.common.exception.MetadataException; +import io.pixelsdb.pixels.common.metadata.MetadataService; +import io.pixelsdb.pixels.common.metadata.domain.File; +import io.pixelsdb.pixels.common.metadata.domain.Layout; +import io.pixelsdb.pixels.common.metadata.domain.Path; +import io.pixelsdb.pixels.common.metadata.domain.Schema; +import io.pixelsdb.pixels.common.metadata.domain.Table; +import io.pixelsdb.pixels.common.utils.PixelsFileNameUtils; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Storage GC: identifies high-deletion-ratio files and (in later tasks) rewrites them + * to reclaim physical storage while keeping active queries unaffected. + * + *

      In Task 2, only S1 (scan + grouping) and in-memory bitmap trimming are implemented; + * S2-S6 rewrite steps are added in Tasks 3-6. + * + *

      Checkpoint ownership: the GC checkpoint is always written unconditionally + * by {@link RetinaResourceManager#runGC()} after Memory GC and before this + * class is invoked. {@code StorageGarbageCollector} never writes checkpoints. + */ +public class StorageGarbageCollector +{ + private static final Logger logger = LogManager.getLogger(StorageGarbageCollector.class); + + private final RetinaResourceManager resourceManager; + private final MetadataService metadataService; + final double gcThreshold; + private final long targetFileSize; + private final int maxFileGroupsPerRun; + + /** + * Tracks file IDs currently being processed by an ongoing Storage GC cycle. + * Prevents the same file from being picked up by a second concurrent scan. + */ + private final ConcurrentHashMap processingFiles = new ConcurrentHashMap<>(); + + // ------------------------------------------------------------------------- + // Value types + // ------------------------------------------------------------------------- + + /** + * Metadata about a single candidate file: its invalid-row ratio exceeds + * {@link #gcThreshold} and it is eligible for GC rewrite. + */ + static final class FileCandidate + { + final File file; + final String filePath; + final long fileId; + final int rgCount; + final long tableId; + final int virtualNodeId; + final long totalRows; + final double invalidRatio; + + FileCandidate(File file, String filePath, long fileId, int rgCount, + long tableId, int virtualNodeId, long totalRows, double invalidRatio) + { + this.file = file; + this.filePath = filePath; + this.fileId = fileId; + this.rgCount = rgCount; + this.tableId = tableId; + this.virtualNodeId = virtualNodeId; + this.totalRows = totalRows; + this.invalidRatio = invalidRatio; + } + } + + /** + * A group of candidate files sharing the same {@code (tableId, virtualNodeId)}. + * Files within the same group may be rewritten together while preserving row-ordering invariants. + */ + static final class FileGroup + { + final long tableId; + final int virtualNodeId; + final List files; + + FileGroup(long tableId, int virtualNodeId, List files) + { + this.tableId = tableId; + this.virtualNodeId = virtualNodeId; + this.files = files; + } + } + + // ------------------------------------------------------------------------- + // Constructor + // ------------------------------------------------------------------------- + + StorageGarbageCollector(RetinaResourceManager resourceManager, + MetadataService metadataService, + double gcThreshold, + long targetFileSize, + int maxFileGroupsPerRun) + { + this.resourceManager = resourceManager; + this.metadataService = metadataService; + this.gcThreshold = gcThreshold; + this.targetFileSize = targetFileSize; + this.maxFileGroupsPerRun = maxFileGroupsPerRun; + } + + // ------------------------------------------------------------------------- + // Public entry point + // ------------------------------------------------------------------------- + + /** + * Runs one Storage GC cycle: S1 scan → group → process candidates. + * + *

      The GC checkpoint has already been written unconditionally by + * {@link RetinaResourceManager#runGC()} before this method is called. + * This method is solely responsible for identifying candidate files and + * processing them (S2-S6 added in Tasks 3-6). + * + * @param safeGcTs safe GC timestamp produced by Memory GC + * @param rgStats per-RG visibility statistics pre-computed during Memory GC; + * key = {@code "_"}, + * value = {@code long[]{recordNum, invalidCount}} + * @param gcSnapshotBitmaps per-RG snapshot bitmaps (mutated in-place by + * {@link #processFileGroups}: non-candidate entries removed) + */ + void runStorageGC(long safeGcTs, Map rgStats, + Map gcSnapshotBitmaps) + { + List fileGroups = scanAndGroupFiles(rgStats); + if (!fileGroups.isEmpty()) + { + processFileGroups(fileGroups, safeGcTs, gcSnapshotBitmaps); + } + } + + // ------------------------------------------------------------------------- + // S1 — scan and group + // ------------------------------------------------------------------------- + + /** + * Scans all schemas/tables and returns at most {@link #maxFileGroupsPerRun} groups of + * candidate files, sorted by average {@code invalidRatio} descending. + * + * @param rgStats per-RG visibility statistics pre-computed by + * {@link RetinaResourceManager#runGC()} during the Memory GC pass; + * key = {@code "_"}, value = {@code long[]{recordNum, invalidCount}} + * where {@code invalidCount} is the number of deleted rows (bitmap popcount). + * Using pre-computed stats avoids re-traversing the visibility map and + * re-computing bitcounts that were already produced during Memory GC. + */ + List scanAndGroupFiles(Map rgStats) + { + List candidates = new ArrayList<>(); + + List schemas; + try + { + schemas = metadataService.getSchemas(); + } + catch (MetadataException e) + { + logger.error("Storage GC S1: failed to retrieve schemas", e); + return new ArrayList<>(); + } + + for (Schema schema : schemas) + { + List

    TypeWriterGC eligible
    retina{@code FileWriterManager} (CDC real-time path)yes
    ordered{@code IndexedPixelsConsumer} (indexed batch load)yes
    ordered{@code FileWriterManager} (CDC real-time path) / + * {@code IndexedPixelsConsumer} (indexed batch load)yes
    compact{@code CompactExecutor}yes
    single{@code SimplePixelsConsumer} (non-indexed batch load)no
    copy{@code CopyExecutor} (test/benchmark data amplification)no
    tables; + try + { + tables = metadataService.getTables(schema.getName()); + } + catch (MetadataException e) + { + logger.warn("Storage GC S1: failed to get tables for schema '{}', skipping", + schema.getName(), e); + continue; + } + + for (Table table : tables) + { + Layout layout; + try + { + layout = metadataService.getLatestLayout(schema.getName(), table.getName()); + } + catch (MetadataException e) + { + logger.warn("Storage GC S1: failed to get layout for {}.{}, skipping", + schema.getName(), table.getName(), e); + continue; + } + if (layout == null) + { + continue; + } + + // Scan both ordered and compact paths; single/copy files are filtered + // by isGcEligible() inside the inner loop. + List paths = new ArrayList<>(); + paths.addAll(layout.getOrderedPaths()); + paths.addAll(layout.getCompactPaths()); + + for (Path path : paths) + { + List files; + try + { + files = metadataService.getFiles(path.getId()); + } + catch (MetadataException e) + { + logger.warn("Storage GC S1: failed to get files for pathId={}, skipping", + path.getId(), e); + continue; + } + + for (File file : files) + { + String filePath = File.getFilePath(path, file); + + // Skip non-GC-eligible types (single, copy) and already-processing files. + if (!PixelsFileNameUtils.isGcEligible(filePath)) + { + continue; + } + if (processingFiles.containsKey(file.getId())) + { + continue; + } + + // Aggregate file-level stats from the pre-computed per-RG rgStats. + // rgStats[rgKey] = {recordNum, invalidCount}; entries absent from the + // map mean the RG has no visibility record and should be skipped. + long totalRows = 0; + long invalidCount = 0; + for (int rgId = 0; rgId < file.getNumRowGroup(); rgId++) + { + String rgKey = file.getId() + "_" + rgId; + long[] stats = rgStats.get(rgKey); + if (stats == null) + { + continue; + } + totalRows += stats[0]; + invalidCount += stats[1]; + } + if (totalRows == 0) + { + continue; + } + + double invalidRatio = (double) invalidCount / totalRows; + if (invalidRatio > gcThreshold) + { + int vNodeId = PixelsFileNameUtils.extractVirtualNodeId(filePath); + candidates.add(new FileCandidate( + file, filePath, file.getId(), file.getNumRowGroup(), + table.getId(), vNodeId, totalRows, invalidRatio)); + } + } + } + } + } + + return groupAndMerge(candidates); + } + + /** + * Groups candidates by {@code (tableId, virtualNodeId)}, sorts each group by + * {@code invalidRatio} descending, then returns the top-N groups (by average + * {@code invalidRatio}) capped at {@link #maxFileGroupsPerRun}. + */ + List groupAndMerge(List candidates) + { + // Two-level map: tableId → virtualNodeId → files + Map>> grouped = new LinkedHashMap<>(); + for (FileCandidate c : candidates) + { + grouped.computeIfAbsent(c.tableId, k -> new LinkedHashMap<>()) + .computeIfAbsent(c.virtualNodeId, k -> new ArrayList<>()) + .add(c); + } + + List groups = new ArrayList<>(); + for (Map.Entry>> tableEntry : grouped.entrySet()) + { + for (Map.Entry> vnodeEntry : tableEntry.getValue().entrySet()) + { + List files = vnodeEntry.getValue(); + files.sort(Comparator.comparingDouble((FileCandidate c) -> c.invalidRatio).reversed()); + groups.add(new FileGroup(tableEntry.getKey(), vnodeEntry.getKey(), files)); + } + } + + // Sort groups by average invalidRatio descending so the worst groups are processed first. + groups.sort(Comparator.comparingDouble( + (FileGroup g) -> g.files.stream().mapToDouble(c -> c.invalidRatio).average().orElse(0.0)) + .reversed()); + + if (groups.size() > maxFileGroupsPerRun) + { + return groups.subList(0, maxFileGroupsPerRun); + } + return groups; + } + + /** + * Processes the candidate file groups produced by S1. + * + *

    Because the GC checkpoint has already been written unconditionally in + * {@link RetinaResourceManager#runGC()} with the full visibility snapshot + * before this method is called, bitmaps for non-candidate files are no longer needed + * for S2-S6. This method therefore immediately trims {@code gcSnapshotBitmaps} to + * retain only the entries for candidate files, releasing memory for files that were + * either below the threshold or excluded by the {@link #maxFileGroupsPerRun} cap. + * + *

    Currently (Task 2) only logging is performed after the trim; S2-S6 rewrite steps + * will be added in Tasks 3-6. + * + * @param fileGroups non-empty list of candidate groups from {@link #scanAndGroupFiles} + * @param safeGcTs safe GC timestamp produced by Memory GC + * @param gcSnapshotBitmaps per-RG snapshot bitmaps (mutated in-place: non-candidate + * entries are removed to reduce memory pressure) + */ + void processFileGroups(List fileGroups, long safeGcTs, + Map gcSnapshotBitmaps) + { + // Release bitmap entries for files that were not selected (below threshold or + // over the maxFileGroupsPerRun cap). The checkpoint has already been written with + // the full snapshot, so S2-S6 only needs bitmaps for candidate files. + Set candidateRgKeys = collectCandidateRgKeys(fileGroups); + gcSnapshotBitmaps.entrySet().removeIf(e -> !candidateRgKeys.contains(e.getKey())); + + // Log candidate groups; S2-S6 will be added in later tasks. + for (FileGroup group : fileGroups) + { + double avgRatio = group.files.stream() + .mapToDouble(c -> c.invalidRatio).average().orElse(0.0); + logger.info("StorageGC candidate: table={}, vNodeId={}, files={}, avgInvalidRatio={:.4f}", + group.tableId, group.virtualNodeId, group.files.size(), avgRatio); + // TODO Task 3+: processFileGroup(group, safeGcTs, gcSnapshotBitmaps) + // Note: gcSnapshotBitmaps entries for this group must be released inside + // processFileGroup() *after* S2 (row-filtering rewrite) completes, + // because S2 reads the bitmaps to determine which rows are alive. + } + } + + /** + * Collects all {@code "_"} keys for every RG in every candidate file + * across all groups. Used by {@link #processFileGroups} to trim the + * {@code gcSnapshotBitmaps} map after the checkpoint has been written. + */ + Set collectCandidateRgKeys(List fileGroups) + { + Set keys = new HashSet<>(); + for (FileGroup group : fileGroups) + { + for (FileCandidate c : group.files) + { + for (int rgId = 0; rgId < c.rgCount; rgId++) + { + keys.add(c.fileId + "_" + rgId); + } + } + } + return keys; + } +} diff --git a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaCheckpoint.java b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaCheckpoint.java index 2f0606c92..6a9b7fb50 100644 --- a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaCheckpoint.java +++ b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaCheckpoint.java @@ -22,6 +22,7 @@ import io.pixelsdb.pixels.common.exception.RetinaException; import io.pixelsdb.pixels.common.physical.Storage; import io.pixelsdb.pixels.common.physical.StorageFactory; +import io.pixelsdb.pixels.common.utils.CheckpointFileIO; import io.pixelsdb.pixels.common.utils.ConfigFactory; import io.pixelsdb.pixels.common.utils.RetinaUtils; import org.junit.Before; @@ -31,8 +32,12 @@ import java.io.DataOutputStream; import java.io.IOException; import java.lang.reflect.Field; +import java.lang.reflect.Method; import java.net.InetAddress; +import java.util.Arrays; +import java.util.HashMap; import java.util.Map; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -40,7 +45,10 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.ThreadLocalRandom; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; /** @@ -59,7 +67,7 @@ public class TestRetinaCheckpoint @Before public void setUp() throws IOException, RetinaException { - testCheckpointDir = ConfigFactory.Instance().getProperty("pixels.retina.checkpoint.dir"); + testCheckpointDir = ConfigFactory.Instance().getProperty("retina.checkpoint.dir"); storage = StorageFactory.Instance().getStorage(testCheckpointDir); if (!storage.exists(testCheckpointDir)) @@ -555,4 +563,167 @@ private boolean isBitSet(long[] bitmap, int rowIndex) return (bitmap[longIndex] & (1L << bitOffset)) != 0; } + + // ----------------------------------------------------------------------- + // GC checkpoint: completeness + bitmap correctness + // ----------------------------------------------------------------------- + + /** + * Creates a {@code long[]} GC snapshot bitmap for one RG where exactly {@code deletedRows} + * out of {@code totalRows} rows are marked deleted (rows 0..deletedRows-1 are set). + */ + private static long[] makeBitmap(int totalRows, int deletedRows) + { + int words = (totalRows + 63) / 64; + long[] bitmap = new long[words]; + for (int r = 0; r < deletedRows; r++) + { + bitmap[r / 64] |= (1L << (r % 64)); + } + return bitmap; + } + + /** + * Calls {@code RetinaResourceManager.createCheckpoint(ts, CheckpointType.GC, bitmaps)} + * via reflection and blocks until the write completes. + */ + @SuppressWarnings("unchecked") + private void invokeCreateGCCheckpoint(long ts, Map bitmaps) throws Exception + { + // Locate the private CheckpointType enum class + Class cpTypeClass = Arrays.stream(RetinaResourceManager.class.getDeclaredClasses()) + .filter(c -> c.getSimpleName().equals("CheckpointType")) + .findFirst() + .orElseThrow(() -> new RuntimeException("CheckpointType enum not found")); + + // Get the GC constant + Object gcConstant = Arrays.stream(cpTypeClass.getEnumConstants()) + .filter(e -> e.toString().equals("GC")) + .findFirst() + .orElseThrow(() -> new RuntimeException("CheckpointType.GC not found")); + + // Get the overloaded createCheckpoint(long, CheckpointType, Map) method + Method method = RetinaResourceManager.class.getDeclaredMethod( + "createCheckpoint", long.class, cpTypeClass, Map.class); + method.setAccessible(true); + + CompletableFuture future = (CompletableFuture) method.invoke( + retinaManager, ts, gcConstant, bitmaps); + future.join(); + } + + /** + * Verifies that a GC checkpoint written with a full {@code gcSnapshotBitmaps} map + * contains ALL RG entries — including those that would not be selected as Storage GC + * candidates — because the checkpoint is written before S1 scanning begins. + * + *

    Setup: 3 files in {@code rgVisibilityMap}: + *

      + *
    • File A: 80 % deleted (would be a candidate)
    • + *
    • File B: 60 % deleted (would be a candidate)
    • + *
    • File C: 20 % deleted (non-candidate)
    • + *
    + * + *

    Expected: checkpoint rgCount = 3; all three entries present with correct + * {@code recordNum} and bitmap content. + */ + @Test + public void testGCCheckpoint_containsAllRGs() throws Exception + { + final long fileIdA = 77001L; + final long fileIdB = 77002L; + final long fileIdC = 77003L; + final int rows = 100; + final long safeGcTs = 500L; + + retinaManager.addVisibility(fileIdA, 0, rows); + retinaManager.addVisibility(fileIdB, 0, rows); + retinaManager.addVisibility(fileIdC, 0, rows); + + long[] bitmapA = makeBitmap(rows, 80); + long[] bitmapB = makeBitmap(rows, 60); + long[] bitmapC = makeBitmap(rows, 20); + + Map gcBitmaps = new HashMap<>(); + gcBitmaps.put(fileIdA + "_0", bitmapA); + gcBitmaps.put(fileIdB + "_0", bitmapB); + gcBitmaps.put(fileIdC + "_0", bitmapC); + + invokeCreateGCCheckpoint(safeGcTs, gcBitmaps); + + String cpPath = resolve(testCheckpointDir, getGcFileName(safeGcTs)); + assertTrue("GC checkpoint file must exist", storage.exists(cpPath)); + + Map entries = new HashMap<>(); + int rgCount = CheckpointFileIO.readCheckpointParallel(cpPath, + e -> entries.put(e.fileId + "_" + e.rgId, e)); + + assertEquals("checkpoint must contain all 3 RGs (not just candidates)", 3, rgCount); + assertEquals("entries map size must be 3", 3, entries.size()); + + CheckpointFileIO.CheckpointEntry entA = entries.get(fileIdA + "_0"); + assertNotNull("fileIdA must be present", entA); + assertEquals("fileIdA recordNum", rows, entA.recordNum); + assertArrayEquals("fileIdA bitmap must match", bitmapA, entA.bitmap); + + CheckpointFileIO.CheckpointEntry entB = entries.get(fileIdB + "_0"); + assertNotNull("fileIdB must be present", entB); + assertEquals("fileIdB recordNum", rows, entB.recordNum); + assertArrayEquals("fileIdB bitmap must match", bitmapB, entB.bitmap); + + CheckpointFileIO.CheckpointEntry entC = entries.get(fileIdC + "_0"); + assertNotNull("fileIdC (non-candidate) must be present", entC); + assertEquals("fileIdC recordNum", rows, entC.recordNum); + assertArrayEquals("fileIdC bitmap must match", bitmapC, entC.bitmap); + } + + /** + * Verifies that the GC checkpoint bitmap content faithfully matches the + * {@code gcSnapshotBitmaps} passed to {@code createCheckpoint}: each word of each + * per-RG bitmap must be preserved exactly, with no cross-RG contamination. + * + *

    Uses a 2-RG file with deliberately complementary bitmaps: + *

      + *
    • RG 0: first word all-ones ({@code rows 0-63} deleted), second word zero
    • + *
    • RG 1: first word zero, second word all-ones ({@code rows 64-127} deleted)
    • + *
    + */ + @Test + public void testGCCheckpoint_bitmapContentIsExact() throws Exception + { + final long fileId = 88001L; + final int rows = 128; // 2 words per RG + final long safeGcTs = 600L; + + retinaManager.addVisibility(fileId, 0, rows); + retinaManager.addVisibility(fileId, 1, rows); + + long[] bitmapRg0 = new long[]{-1L, 0L}; // rows 0-63 deleted + long[] bitmapRg1 = new long[]{0L, -1L}; // rows 64-127 deleted + + Map gcBitmaps = new HashMap<>(); + gcBitmaps.put(fileId + "_0", bitmapRg0); + gcBitmaps.put(fileId + "_1", bitmapRg1); + + invokeCreateGCCheckpoint(safeGcTs, gcBitmaps); + + String cpPath = resolve(testCheckpointDir, getGcFileName(safeGcTs)); + assertTrue("GC checkpoint file must exist", storage.exists(cpPath)); + + Map entries = new HashMap<>(); + int rgCount = CheckpointFileIO.readCheckpointParallel(cpPath, + e -> entries.put(e.fileId + "_" + e.rgId, e)); + + assertEquals("checkpoint must contain 2 RGs", 2, rgCount); + + CheckpointFileIO.CheckpointEntry rg0 = entries.get(fileId + "_0"); + assertNotNull("RG 0 must be present", rg0); + assertEquals("RG 0 word 0 must be all-ones (rows 0-63 deleted)", -1L, rg0.bitmap[0]); + assertEquals("RG 0 word 1 must be zero (rows 64-127 live)", 0L, rg0.bitmap[1]); + + CheckpointFileIO.CheckpointEntry rg1 = entries.get(fileId + "_1"); + assertNotNull("RG 1 must be present", rg1); + assertEquals("RG 1 word 0 must be zero (rows 0-63 live)", 0L, rg1.bitmap[0]); + assertEquals("RG 1 word 1 must be all-ones (rows 64-127 deleted)", -1L, rg1.bitmap[1]); + } } diff --git a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestStorageGarbageCollector.java b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestStorageGarbageCollector.java new file mode 100644 index 000000000..d2ee51c2f --- /dev/null +++ b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestStorageGarbageCollector.java @@ -0,0 +1,548 @@ +/* + * Copyright 2026 PixelsDB. + * + * This file is part of Pixels. + * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Affero GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. If not, see + * . + */ +package io.pixelsdb.pixels.retina; + +import io.pixelsdb.pixels.common.metadata.domain.File; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.lang.reflect.Field; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +/** + * Verifies Task 2: StorageGarbageCollector S1 scan + grouping + bitmap trim. + * + *

    Verification points

    + *
      + *
    1. groupAndMerge correctness — candidates are grouped by {@code (tableId, virtualNodeId)}, + * sorted by average {@code invalidRatio} descending, and capped at {@code maxFileGroupsPerRun}.
    2. + *
    3. S1 threshold filtering — only files with {@code invalidRatio > gcThreshold} are + * included; e.g., threshold=0.5 → 60 % and 80 % selected, 40 % excluded.
    4. + *
    5. Bitmap trim after scan — {@code processFileGroups} retains only candidate RG bitmaps + * from {@code gcSnapshotBitmaps}, releasing memory for files not selected for rewrite.
    6. + *
    7. System stability — in Task 2, {@code processFileGroups} only trims and logs; + * S2-S6 are not triggered and the visibility map is not modified.
    8. + *
    + */ +public class TestStorageGarbageCollector +{ + // ----------------------------------------------------------------------- + // Test infrastructure + // ----------------------------------------------------------------------- + + private RetinaResourceManager retinaManager; + + @Before + public void setUp() + { + retinaManager = RetinaResourceManager.Instance(); + resetManagerState(); + } + + @After + public void tearDown() + { + resetManagerState(); + } + + // ----------------------------------------------------------------------- + // Helpers + // ----------------------------------------------------------------------- + + /** Clears RetinaResourceManager internal maps and resets latestGcTimestamp. */ + private void resetManagerState() + { + try + { + Field rgMapField = RetinaResourceManager.class.getDeclaredField("rgVisibilityMap"); + rgMapField.setAccessible(true); + ((Map) rgMapField.get(retinaManager)).clear(); + + Field gcTsField = RetinaResourceManager.class.getDeclaredField("latestGcTimestamp"); + gcTsField.setAccessible(true); + gcTsField.setLong(retinaManager, -1L); + } + catch (Exception e) + { + throw new RuntimeException("Failed to reset RetinaResourceManager state", e); + } + } + + /** Creates a minimal {@link File} domain object with given id and rgCount. */ + private static File makeFile(long fileId, int rgCount) + { + File f = new File(); + f.setId(fileId); + f.setNumRowGroup(rgCount); + return f; + } + + /** + * Creates a {@code long[]} GC snapshot bitmap for one RG where exactly {@code deletedRows} + * out of {@code totalRows} rows are marked as deleted (rows 0 .. deletedRows-1 are set). + */ + private static long[] makeBitmap(int totalRows, int deletedRows) + { + int words = (totalRows + 63) / 64; + long[] bitmap = new long[words]; + for (int r = 0; r < deletedRows; r++) + { + bitmap[r / 64] |= (1L << (r % 64)); + } + return bitmap; + } + + /** + * Creates a per-RG stats entry {@code {recordNum, invalidCount}} for one RG + * where exactly {@code deletedRows} out of {@code totalRows} rows are deleted. + * This mirrors what {@code runGC()} pre-computes during the Memory GC pass. + */ + private static long[] makeRgStats(int totalRows, int deletedRows) + { + return new long[]{totalRows, deletedRows}; + } + + // ----------------------------------------------------------------------- + // Stub: a StorageGarbageCollector subclass that bypasses MetadataService + // and lets callers inject a controlled list of fake files to scan. + // ----------------------------------------------------------------------- + + /** Represents a fake catalog file entry used by {@link DirectScanStorageGC}. */ + static class FakeFileEntry + { + final long fileId; + final int rgCount; + final long tableId; + final int virtualNodeId; + + FakeFileEntry(long fileId, int rgCount, long tableId, int virtualNodeId) + { + this.fileId = fileId; + this.rgCount = rgCount; + this.tableId = tableId; + this.virtualNodeId = virtualNodeId; + } + } + + /** + * StorageGarbageCollector subclass that replaces the metadata scan loop with + * a caller-supplied list of {@link FakeFileEntry} objects, while reusing the + * real invalidRatio computation and {@link StorageGarbageCollector#groupAndMerge} logic. + */ + static class DirectScanStorageGC extends StorageGarbageCollector + { + private final List fakeEntries; + + DirectScanStorageGC(RetinaResourceManager rm, double threshold, + int maxGroups, List fakeEntries) + { + super(rm, null, threshold, 134_217_728L, maxGroups); + this.fakeEntries = fakeEntries; + } + + @Override + List scanAndGroupFiles(Map rgStats) + { + List candidates = new ArrayList<>(); + for (FakeFileEntry entry : fakeEntries) + { + long totalRows = 0; + long invalidCount = 0; + for (int rgId = 0; rgId < entry.rgCount; rgId++) + { + String rgKey = entry.fileId + "_" + rgId; + long[] stats = rgStats.get(rgKey); + if (stats == null) continue; + totalRows += stats[0]; + invalidCount += stats[1]; + } + if (totalRows == 0) continue; + double ratio = (double) invalidCount / totalRows; + if (ratio > gcThreshold) + { + candidates.add(new FileCandidate( + makeFile(entry.fileId, entry.rgCount), + "fake_" + entry.fileId + "_0_" + entry.virtualNodeId + "_ordered.pxl", + entry.fileId, entry.rgCount, + entry.tableId, entry.virtualNodeId, + totalRows, ratio)); + } + } + return groupAndMerge(candidates); + } + } + + // ----------------------------------------------------------------------- + // Tests: groupAndMerge logic + // ----------------------------------------------------------------------- + + /** + * Three candidates from three distinct {@code (tableId, virtualNodeId)} pairs; + * expect three separate groups sorted by invalidRatio descending. + */ + @Test + public void testGroupAndMerge_threeDistinctGroups() + { + StorageGarbageCollector gc = new StorageGarbageCollector( + null, null, 0.5, 0L, 10); + + List candidates = Arrays.asList( + new StorageGarbageCollector.FileCandidate(makeFile(1, 1), "f1", 1, 1, 1L, 0, 100, 0.60), + new StorageGarbageCollector.FileCandidate(makeFile(2, 1), "f2", 2, 1, 1L, 1, 100, 0.70), + new StorageGarbageCollector.FileCandidate(makeFile(3, 1), "f3", 3, 1, 2L, 0, 100, 0.80) + ); + + List groups = gc.groupAndMerge(candidates); + + assertEquals("expected 3 groups", 3, groups.size()); + // sorted by avg invalidRatio desc: 0.80, 0.70, 0.60 + assertEquals("group 0 should have highest ratio (0.80)", 0.80, + groups.get(0).files.get(0).invalidRatio, 1e-9); + assertEquals("group 1 should have ratio 0.70", 0.70, + groups.get(1).files.get(0).invalidRatio, 1e-9); + assertEquals("group 2 should have lowest ratio (0.60)", 0.60, + groups.get(2).files.get(0).invalidRatio, 1e-9); + } + + /** + * Two candidates with the same {@code (tableId, virtualNodeId)} must be in one group, + * with files sorted by invalidRatio descending inside the group. + */ + @Test + public void testGroupAndMerge_twoFilesInSameGroup() + { + StorageGarbageCollector gc = new StorageGarbageCollector( + null, null, 0.5, 0L, 10); + + List candidates = Arrays.asList( + new StorageGarbageCollector.FileCandidate(makeFile(1, 1), "f1", 1, 1, 1L, 5, 100, 0.60), + new StorageGarbageCollector.FileCandidate(makeFile(2, 1), "f2", 2, 1, 1L, 5, 100, 0.80) + ); + + List groups = gc.groupAndMerge(candidates); + + assertEquals("both candidates should form one group", 1, groups.size()); + StorageGarbageCollector.FileGroup grp = groups.get(0); + assertEquals(1L, grp.tableId); + assertEquals(5, grp.virtualNodeId); + assertEquals(2, grp.files.size()); + assertTrue("files within group sorted by invalidRatio desc", + grp.files.get(0).invalidRatio >= grp.files.get(1).invalidRatio); + assertEquals(0.80, grp.files.get(0).invalidRatio, 1e-9); + assertEquals(0.60, grp.files.get(1).invalidRatio, 1e-9); + } + + /** + * When there are more candidate groups than {@code maxFileGroupsPerRun}, + * only the top-N groups (highest average invalidRatio) are returned. + */ + @Test + public void testGroupAndMerge_maxGroupsCap() + { + int max = 3; + StorageGarbageCollector gc = new StorageGarbageCollector( + null, null, 0.5, 0L, max); + + // Build 5 groups with different tableIds and clear invalidRatios (0.55..0.99) + List candidates = new ArrayList<>(); + for (int i = 0; i < 5; i++) + { + candidates.add(new StorageGarbageCollector.FileCandidate( + makeFile(i, 1), "f" + i, i, 1, (long) (i + 10), 0, 100, 0.55 + i * 0.11)); + } + + List groups = gc.groupAndMerge(candidates); + + assertEquals("groups must be capped at maxFileGroupsPerRun", max, groups.size()); + // First group must have the highest average invalidRatio + double firstAvg = groups.get(0).files.stream() + .mapToDouble(c -> c.invalidRatio).average().orElse(0); + double lastAvg = groups.get(groups.size() - 1).files.stream() + .mapToDouble(c -> c.invalidRatio).average().orElse(0); + assertTrue("groups must be sorted best-first", firstAvg >= lastAvg); + } + + /** + * An empty candidate list must produce an empty group list. + */ + @Test + public void testGroupAndMerge_emptyCandidates() + { + StorageGarbageCollector gc = new StorageGarbageCollector( + null, null, 0.5, 0L, 10); + List groups = + gc.groupAndMerge(Collections.emptyList()); + assertTrue("empty candidates → empty groups", groups.isEmpty()); + } + + // ----------------------------------------------------------------------- + // Tests: S1 threshold filtering (via DirectScanStorageGC) + // ----------------------------------------------------------------------- + + /** + * Three files with deletion ratios 60 %, 40 %, 80 % against threshold=0.5; + * only the 60 % and 80 % files should appear as candidates, and they must be + * grouped by {@code (tableId, virtualNodeId)}. + * + *

    This is the primary verification specified in the Task 2 doc: + * "threshold=0.5 → 60 % and 80 % selected, 40 % excluded". + */ + @Test + public void testScanAndGroupFiles_thresholdFiltering() + { + int totalRows = 100; + long fileId60 = 60001L; + long fileId40 = 40001L; + long fileId80 = 80001L; + + // Build rgStats directly: {recordNum, invalidCount} per RG. + // This simulates what runGC() pre-computes during the Memory GC pass. + Map rgStats = new HashMap<>(); + rgStats.put(fileId60 + "_0", makeRgStats(totalRows, 60)); // 60 % deleted + rgStats.put(fileId40 + "_0", makeRgStats(totalRows, 40)); // 40 % deleted + rgStats.put(fileId80 + "_0", makeRgStats(totalRows, 80)); // 80 % deleted + + List fakeFiles = Arrays.asList( + new FakeFileEntry(fileId60, 1, 1L, 0), // ratio=0.60, should be selected + new FakeFileEntry(fileId40, 1, 1L, 0), // ratio=0.40, should be excluded + new FakeFileEntry(fileId80, 1, 2L, 0) // ratio=0.80, should be selected (different table) + ); + + DirectScanStorageGC gc = new DirectScanStorageGC( + retinaManager, 0.5, 10, fakeFiles); + + List groups = gc.scanAndGroupFiles(rgStats); + + // 60 % and 80 % → 2 separate groups (different tableId: 1 and 2) + assertEquals("2 groups expected (60% and 80%)", 2, groups.size()); + + // Collect all selected fileIds + List selectedIds = new ArrayList<>(); + for (StorageGarbageCollector.FileGroup g : groups) + { + for (StorageGarbageCollector.FileCandidate c : g.files) + { + selectedIds.add(c.fileId); + } + } + + assertTrue("fileId60 should be selected", selectedIds.contains(fileId60)); + assertFalse("fileId40 should NOT be selected (ratio <= threshold)", selectedIds.contains(fileId40)); + assertTrue("fileId80 should be selected", selectedIds.contains(fileId80)); + } + + /** + * Two files sharing the same {@code (tableId, virtualNodeId)} must be in the same group. + */ + @Test + public void testScanAndGroupFiles_sameTableVNodeGroupedTogether() + { + int totalRows = 100; + long fileIdA = 70001L; + long fileIdB = 70002L; + + Map rgStats = new HashMap<>(); + rgStats.put(fileIdA + "_0", makeRgStats(totalRows, 60)); // 60 % + rgStats.put(fileIdB + "_0", makeRgStats(totalRows, 75)); // 75 % + + // Both files belong to same (tableId=5, vNodeId=3) + List fakeFiles = Arrays.asList( + new FakeFileEntry(fileIdA, 1, 5L, 3), + new FakeFileEntry(fileIdB, 1, 5L, 3) + ); + + DirectScanStorageGC gc = new DirectScanStorageGC( + retinaManager, 0.5, 10, fakeFiles); + + List groups = gc.scanAndGroupFiles(rgStats); + + assertEquals("both files share (table=5, vNode=3) → 1 group", 1, groups.size()); + assertEquals("group must contain 2 files", 2, groups.get(0).files.size()); + assertEquals(5L, groups.get(0).tableId); + assertEquals(3, groups.get(0).virtualNodeId); + } + + /** + * A file whose RG has no entry in {@code rgStats} must be skipped + * (totalRows == 0 → excluded regardless of threshold). + */ + @Test + public void testScanAndGroupFiles_skipsFilesWithNoVisibility() + { + long orphanFileId = 99999L; // no entry in rgStats → totalRows stays 0 + Map rgStats = new HashMap<>(); + + List fakeFiles = Collections.singletonList( + new FakeFileEntry(orphanFileId, 1, 1L, 0)); + + DirectScanStorageGC gc = new DirectScanStorageGC( + retinaManager, 0.5, 10, fakeFiles); + + List groups = gc.scanAndGroupFiles(rgStats); + assertTrue("file with no rgStats entry should be skipped", groups.isEmpty()); + } + + // ----------------------------------------------------------------------- + // Tests: processFileGroups bitmap trimming + // ----------------------------------------------------------------------- + + /** + * After {@code processFileGroups}, the {@code gcSnapshotBitmaps} map must contain + * only the RG keys belonging to candidate files. Non-candidate entries must be removed + * to release memory (the GC checkpoint has already been written with the full snapshot + * by {@code runGC()} before this point). + */ + @Test + public void testProcessFileGroups_trimsBitmapMapToCandidate() + { + long candidateFileId = 66001L; + long otherFileId = 66002L; + + Map bitmaps = new HashMap<>(); + bitmaps.put(candidateFileId + "_0", makeBitmap(100, 60)); + bitmaps.put(otherFileId + "_0", makeBitmap(100, 20)); // below threshold, not a candidate + + StorageGarbageCollector.FileCandidate candidate = + new StorageGarbageCollector.FileCandidate( + makeFile(candidateFileId, 1), "f_" + candidateFileId, + candidateFileId, 1, 1L, 0, 100, 0.60); + StorageGarbageCollector.FileGroup group = + new StorageGarbageCollector.FileGroup( + 1L, 0, Collections.singletonList(candidate)); + + StorageGarbageCollector gc = new StorageGarbageCollector( + null, null, 0.5, 134_217_728L, 10); + + gc.processFileGroups(Collections.singletonList(group), 300L, bitmaps); + + assertTrue("candidate RG key must be retained", + bitmaps.containsKey(candidateFileId + "_0")); + assertFalse("non-candidate RG key must be removed", + bitmaps.containsKey(otherFileId + "_0")); + } + + /** + * Files excluded by the {@code maxFileGroupsPerRun} cap must also have their bitmap + * entries removed: they were scanned but not selected for rewrite this cycle. + */ + @Test + public void testProcessFileGroups_trimsBitmapForCapExcludedFiles() + { + // Three files that would each form their own group, but maxGroups = 1 + long fileId1 = 67001L; // highest ratio → selected + long fileId2 = 67002L; // medium ratio → excluded by cap + long fileId3 = 67003L; // lowest ratio → excluded by cap + + Map bitmaps = new HashMap<>(); + bitmaps.put(fileId1 + "_0", makeBitmap(100, 80)); + bitmaps.put(fileId2 + "_0", makeBitmap(100, 70)); + bitmaps.put(fileId3 + "_0", makeBitmap(100, 60)); + + StorageGarbageCollector.FileCandidate c1 = new StorageGarbageCollector.FileCandidate( + makeFile(fileId1, 1), "f1", fileId1, 1, 10L, 0, 100, 0.80); + + // Only the top-1 group is passed to processFileGroups (cap=1 applied by groupAndMerge) + StorageGarbageCollector.FileGroup selectedGroup = + new StorageGarbageCollector.FileGroup(10L, 0, Collections.singletonList(c1)); + + StorageGarbageCollector gc = new StorageGarbageCollector( + null, null, 0.5, 134_217_728L, 1); + + gc.processFileGroups(Collections.singletonList(selectedGroup), 400L, bitmaps); + + assertTrue("selected file bitmap must be retained", bitmaps.containsKey(fileId1 + "_0")); + assertFalse("cap-excluded file2 bitmap must be removed", bitmaps.containsKey(fileId2 + "_0")); + assertFalse("cap-excluded file3 bitmap must be removed", bitmaps.containsKey(fileId3 + "_0")); + } + + // ----------------------------------------------------------------------- + // Tests: runStorageGC end-to-end (scan → process) + // ----------------------------------------------------------------------- + + /** + * When {@code scanAndGroupFiles} finds no candidates, {@code runStorageGC} must be a + * no-op: {@code gcSnapshotBitmaps} must remain unchanged. + */ + @Test + public void testRunStorageGC_noopWhenNoCandidates() + { + long fileId = 55001L; + + // Empty rgStats → totalRows = 0 for every fake file → no candidates + Map rgStats = new HashMap<>(); + + Map bitmaps = new HashMap<>(); + bitmaps.put(fileId + "_0", makeBitmap(100, 30)); + + DirectScanStorageGC gc = new DirectScanStorageGC( + retinaManager, 0.5, 10, + Collections.singletonList(new FakeFileEntry(fileId, 1, 1L, 0))); + + gc.runStorageGC(100L, rgStats, bitmaps); + + // No candidates → bitmaps must be untouched + assertTrue("bitmap must be unchanged when no candidates", + bitmaps.containsKey(fileId + "_0")); + assertEquals("bitmap map size must stay 1", 1, bitmaps.size()); + } + + /** + * When {@code runStorageGC} finds candidates, non-candidate bitmaps must be trimmed + * from {@code gcSnapshotBitmaps} while candidate bitmaps are retained for S2-S6. + */ + @Test + public void testRunStorageGC_trimsBitmapsForCandidates() + { + long candidateFileId = 56001L; // 70 % deleted → above threshold + long nonCandidateFileId = 56002L; // 20 % deleted → below threshold + + Map rgStats = new HashMap<>(); + rgStats.put(candidateFileId + "_0", makeRgStats(100, 70)); + rgStats.put(nonCandidateFileId + "_0", makeRgStats(100, 20)); + + Map bitmaps = new HashMap<>(); + bitmaps.put(candidateFileId + "_0", makeBitmap(100, 70)); + bitmaps.put(nonCandidateFileId + "_0", makeBitmap(100, 20)); + + DirectScanStorageGC gc = new DirectScanStorageGC( + retinaManager, 0.5, 10, + Arrays.asList( + new FakeFileEntry(candidateFileId, 1, 1L, 0), + new FakeFileEntry(nonCandidateFileId, 1, 1L, 0))); + + gc.runStorageGC(200L, rgStats, bitmaps); + + assertTrue("candidate bitmap must be retained for S2", + bitmaps.containsKey(candidateFileId + "_0")); + assertFalse("non-candidate bitmap must be trimmed", + bitmaps.containsKey(nonCandidateFileId + "_0")); + } + +} From 1146b38509786fe5fa4babefd7bd7164934ad564 Mon Sep 17 00:00:00 2001 From: Dongyang Geng Date: Sun, 29 Mar 2026 21:22:56 +0800 Subject: [PATCH 7/7] feat: unify constructors --- cpp/pixels-retina/include/RGVisibility.h | 4 +- cpp/pixels-retina/include/RGVisibilityJni.h | 10 +---- cpp/pixels-retina/include/TileVisibility.h | 16 +++---- cpp/pixels-retina/lib/RGVisibility.cpp | 40 ++++++----------- cpp/pixels-retina/lib/RGVisibilityJni.cpp | 45 +++++++------------ cpp/pixels-retina/lib/TileVisibility.cpp | 19 ++------ .../pixelsdb/pixels/retina/RGVisibility.java | 28 ++++++++---- 7 files changed, 65 insertions(+), 97 deletions(-) diff --git a/cpp/pixels-retina/include/RGVisibility.h b/cpp/pixels-retina/include/RGVisibility.h index 3f1ff1bcd..4252d0816 100644 --- a/cpp/pixels-retina/include/RGVisibility.h +++ b/cpp/pixels-retina/include/RGVisibility.h @@ -26,8 +26,8 @@ template class RGVisibility : public pixels::RetinaBase> { public: - explicit RGVisibility(uint64_t rgRecordNum); - explicit RGVisibility(uint64_t rgRecordNum, uint64_t timestamp, const std::vector& initialBitmap); + explicit RGVisibility(uint64_t rgRecordNum, uint64_t timestamp = 0, + const std::vector* initialBitmap = nullptr); ~RGVisibility() override; void deleteRGRecord(uint32_t rowId, uint64_t timestamp); diff --git a/cpp/pixels-retina/include/RGVisibilityJni.h b/cpp/pixels-retina/include/RGVisibilityJni.h index d4cbb372f..9cb40244e 100644 --- a/cpp/pixels-retina/include/RGVisibilityJni.h +++ b/cpp/pixels-retina/include/RGVisibilityJni.h @@ -10,17 +10,9 @@ extern "C" { /* * Class: io_pixelsdb_pixels_retina_RGVisibility * Method: createNativeObject - * Signature: (J)J - */ -JNIEXPORT jlong JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_createNativeObject - (JNIEnv *, jobject, jlong); - -/* - * Class: io_pixelsdb_pixels_retina_RGVisibility - * Method: createNativeObjectInitialized * Signature: (JJ[J)J */ -JNIEXPORT jlong JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_createNativeObjectInitialized +JNIEXPORT jlong JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_createNativeObject (JNIEnv *, jobject, jlong, jlong, jlongArray); /* diff --git a/cpp/pixels-retina/include/TileVisibility.h b/cpp/pixels-retina/include/TileVisibility.h index af299dcc7..3374ffd68 100644 --- a/cpp/pixels-retina/include/TileVisibility.h +++ b/cpp/pixels-retina/include/TileVisibility.h @@ -65,13 +65,13 @@ struct VersionedData : public pixels::RetinaBase> { uint64_t baseTimestamp; DeleteIndexBlock* head; // Delete chain head, part of the version - VersionedData() : baseTimestamp(0), head(nullptr) { - std::memset(baseBitmap, 0, sizeof(baseBitmap)); - } - - VersionedData(uint64_t ts, const uint64_t* bitmap, DeleteIndexBlock* h) + // timestamp defaults to 0; bitmap defaults to all-zeros. + explicit VersionedData(uint64_t ts = 0, const uint64_t* bitmap = nullptr, DeleteIndexBlock* h = nullptr) : baseTimestamp(ts), head(h) { - std::memcpy(baseBitmap, bitmap, NUM_WORDS * sizeof(uint64_t)); + if (bitmap) + std::memcpy(baseBitmap, bitmap, NUM_WORDS * sizeof(uint64_t)); + else + std::memset(baseBitmap, 0, sizeof(baseBitmap)); } }; @@ -92,8 +92,8 @@ template class TileVisibility : public pixels::RetinaBase> { static constexpr size_t NUM_WORDS = BITMAP_WORDS(CAPACITY); public: - TileVisibility(); - TileVisibility(uint64_t ts, const uint64_t* bitmap); + // timestamp defaults to 0; bitmap defaults to all-zeros. + explicit TileVisibility(uint64_t timestamp = 0, const uint64_t* bitmap = nullptr); ~TileVisibility() override; void deleteTileRecord(uint16_t rowId, uint64_t ts); void getTileVisibilityBitmap(uint64_t ts, uint64_t* outBitmap) const; diff --git a/cpp/pixels-retina/lib/RGVisibility.cpp b/cpp/pixels-retina/lib/RGVisibility.cpp index f6e33a175..f8861ac57 100644 --- a/cpp/pixels-retina/lib/RGVisibility.cpp +++ b/cpp/pixels-retina/lib/RGVisibility.cpp @@ -22,35 +22,23 @@ #include #include +// Validates before allocation: any throw leaves tileVisibilities as nullptr, +// so the incomplete constructor does not invoke the destructor (no memory leak). template -RGVisibility::RGVisibility(uint64_t rgRecordNum) - : tileCount((rgRecordNum + VISIBILITY_RECORD_CAPACITY - 1) / VISIBILITY_RECORD_CAPACITY) { - size_t allocSize = tileCount * sizeof(TileVisibility); - void* rawMemory = operator new[](allocSize); - tileVisibilities = static_cast*>(rawMemory); - for (uint64_t i = 0; i < tileCount; ++i) { - new (&tileVisibilities[i]) TileVisibility(); - } -} - -template -RGVisibility::RGVisibility(uint64_t rgRecordNum, uint64_t timestamp, const std::vector& initialBitmap) - : tileCount((rgRecordNum + VISIBILITY_RECORD_CAPACITY - 1) / VISIBILITY_RECORD_CAPACITY) { - size_t allocSize = tileCount * sizeof(TileVisibility); - void* rawMemory = operator new[](allocSize); +RGVisibility::RGVisibility(uint64_t rgRecordNum, uint64_t timestamp, + const std::vector* initialBitmap) + : tileCount((rgRecordNum + VISIBILITY_RECORD_CAPACITY - 1) / VISIBILITY_RECORD_CAPACITY), + tileVisibilities(nullptr) { + if (initialBitmap && initialBitmap->size() < tileCount * BITMAP_SIZE_PER_TILE_VISIBILITY) + throw std::invalid_argument("Initial bitmap size is too small for the given record number."); - if (initialBitmap.size() < tileCount * BITMAP_SIZE_PER_TILE_VISIBILITY) { - operator delete[](rawMemory); - throw std::runtime_error("Initial bitmap size is too small for the given record number."); - } + tileVisibilities = static_cast*>( + operator new[](tileCount * sizeof(TileVisibility))); - tileVisibilities = static_cast*>(rawMemory); - for (uint64_t i = 0; i < tileCount; ++i) { - // Each tile takes 4 uint64_t - const uint64_t* tileBitmap = &initialBitmap[i * BITMAP_SIZE_PER_TILE_VISIBILITY]; - // We use timestamp 0 for restored checkpoints to serve as the base state - new (&tileVisibilities[i]) TileVisibility(timestamp, tileBitmap); - } + for (uint64_t i = 0; i < tileCount; ++i) + new (&tileVisibilities[i]) TileVisibility( + timestamp, + initialBitmap ? initialBitmap->data() + i * BITMAP_SIZE_PER_TILE_VISIBILITY : nullptr); } template diff --git a/cpp/pixels-retina/lib/RGVisibilityJni.cpp b/cpp/pixels-retina/lib/RGVisibilityJni.cpp index 88a3ce7ea..4c8116ea6 100644 --- a/cpp/pixels-retina/lib/RGVisibilityJni.cpp +++ b/cpp/pixels-retina/lib/RGVisibilityJni.cpp @@ -26,40 +26,28 @@ /* * Class: io_pixelsdb_pixels_retina_RGVisibility * Method: createNativeObject - * Signature: (J)J - */ -JNIEXPORT jlong JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_createNativeObject - (JNIEnv* env, jobject, jlong rgRecordNum) { - try { - auto* rgVisibility = new RGVisibilityInstance(rgRecordNum); - return reinterpret_cast(rgVisibility); - } catch (const std::exception& e) { - env->ThrowNew(env->FindClass("java/lang/RuntimeException"), e.what()); - return 0; - } -} - -/* - * Class: io_pixelsdb_pixels_retina_RGVisibility - * Method: createNativeObjectInitialized * Signature: (JJ[J)J + * + * Converts the Java bitmap array to a native vector when present, then + * forwards to a single RGVisibility constructor call. */ -JNIEXPORT jlong JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_createNativeObjectInitialized +JNIEXPORT jlong JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_createNativeObject (JNIEnv* env, jobject, jlong rgRecordNum, jlong timestamp, jlongArray bitmap) { try { - jsize len = env->GetArrayLength(bitmap); - jlong *body = env->GetLongArrayElements(bitmap, nullptr); - std::vector bitmapData; - bitmapData.reserve(len); - for (int i = 0; i < len; i++) { - bitmapData.push_back((uint64_t)body[i]); + const std::vector* bitmapPtr = nullptr; + if (bitmap != nullptr) { + jsize len = env->GetArrayLength(bitmap); + jlong* body = env->GetLongArrayElements(bitmap, nullptr); + bitmapData.assign(reinterpret_cast(body), + reinterpret_cast(body) + len); + env->ReleaseLongArrayElements(bitmap, body, JNI_ABORT); + bitmapPtr = &bitmapData; } - - env->ReleaseLongArrayElements(bitmap, body, JNI_ABORT); - - RGVisibilityInstance *rgVisibility = new RGVisibilityInstance(rgRecordNum, timestamp, bitmapData); - return reinterpret_cast(rgVisibility); + return reinterpret_cast(new RGVisibilityInstance( + static_cast(rgRecordNum), + static_cast(timestamp), + bitmapPtr)); } catch (const std::exception& e) { env->ThrowNew(env->FindClass("java/lang/RuntimeException"), e.what()); return 0; @@ -195,6 +183,5 @@ JNIEXPORT jlong JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_getRetinaTra */ JNIEXPORT jlong JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_getRetinaObjectCount (JNIEnv *env, jclass clazz) { - // Read the atomic object counter from RetinaBase namespace return static_cast(pixels::g_retina_object_count.load(std::memory_order_relaxed)); } diff --git a/cpp/pixels-retina/lib/TileVisibility.cpp b/cpp/pixels-retina/lib/TileVisibility.cpp index 4e02e6605..dd40263c4 100644 --- a/cpp/pixels-retina/lib/TileVisibility.cpp +++ b/cpp/pixels-retina/lib/TileVisibility.cpp @@ -27,20 +27,9 @@ #include template -TileVisibility::TileVisibility() { - VersionedData* initialVersion = new VersionedData(); - currentVersion.store(initialVersion, std::memory_order_release); - tail.store(nullptr, std::memory_order_release); - tailUsed.store(0, std::memory_order_release); -} - -template -TileVisibility::TileVisibility(uint64_t ts, const uint64_t* bitmap) { - VersionedData* initialVersion = new VersionedData(ts, bitmap, nullptr); - currentVersion.store(initialVersion, std::memory_order_release); - tail.store(nullptr, std::memory_order_release); - tailUsed.store(0, std::memory_order_release); -} +TileVisibility::TileVisibility(uint64_t timestamp, const uint64_t* bitmap) + : currentVersion(new VersionedData(timestamp, bitmap)), + tail(nullptr), tailUsed(0) {} template TileVisibility::~TileVisibility() { @@ -392,4 +381,4 @@ void TileVisibility::reclaimRetiredVersions() { } // Explicit Instantiations (Add the sizes you need here) -template class TileVisibility; +template class TileVisibility; \ No newline at end of file diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RGVisibility.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RGVisibility.java index a2abad86c..83a105f67 100644 --- a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RGVisibility.java +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RGVisibility.java @@ -68,19 +68,32 @@ public class RGVisibility implements AutoCloseable public RGVisibility(long rgRecordNum) { this.recordNum = rgRecordNum; - this.nativeHandle.set(createNativeObject(rgRecordNum)); + this.nativeHandle.set(createNativeObject(rgRecordNum, 0L, null)); } - public RGVisibility(long rgRecordNum, long timestamp, long[] initialBitmap) + public RGVisibility(long rgRecordNum, long timestamp) { + if (timestamp <= 0) + { + throw new IllegalArgumentException("timestamp must be positive when provided"); + } this.recordNum = rgRecordNum; - if (initialBitmap == null) + this.nativeHandle.set(createNativeObject(rgRecordNum, timestamp, null)); + } + + public RGVisibility(long rgRecordNum, long timestamp, long[] initialBitmap) + { + if (timestamp <= 0) { - this.nativeHandle.set(createNativeObject(rgRecordNum)); - } else + throw new IllegalArgumentException("timestamp must be positive when provided"); + } + + if (initialBitmap == null) { - this.nativeHandle.set(createNativeObjectInitialized(rgRecordNum, timestamp, initialBitmap)); + throw new IllegalArgumentException("initial bitmap must not be null"); } + this.recordNum = rgRecordNum; + this.nativeHandle.set(createNativeObject(rgRecordNum, timestamp, initialBitmap)); } public long getRecordNum() @@ -99,8 +112,7 @@ public void close() } // native methods - private native long createNativeObject(long rgRecordNum); - private native long createNativeObjectInitialized(long rgRecordNum, long timestamp, long[] bitmap); + private native long createNativeObject(long rgRecordNum, long timestamp, long[] bitmap); private native void destroyNativeObject(long nativeHandle); private native void deleteRecord(int rgRowOffset, long timestamp, long nativeHandle); private native long[] getVisibilityBitmap(long timestamp, long nativeHandle);