-
Notifications
You must be signed in to change notification settings - Fork 550
[client] Fix tiering hang on first_row merge engine empty batches #3242
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
d3984d1
470aee0
6e38765
5266654
0eeb3a7
656b27c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -142,7 +142,8 @@ public ScanRecords poll(Duration timeout) { | |
| long startNanos = System.nanoTime(); | ||
| do { | ||
| ScanRecords scanRecords = pollForFetches(); | ||
| if (scanRecords.isEmpty()) { | ||
| // Gate on buckets() rather than isEmpty() so progress-only polls reach the caller. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. curious about why we need to change this? Is there any problem if not change this?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. |
||
| if (scanRecords.buckets().isEmpty()) { | ||
| try { | ||
| if (!logFetcher.awaitNotEmpty(startNanos + timeoutNanos)) { | ||
| // logFetcher waits for the timeout and no data in buffer, | ||
|
|
@@ -249,7 +250,8 @@ public void wakeup() { | |
|
|
||
| private ScanRecords pollForFetches() { | ||
| ScanRecords scanRecords = logFetcher.collectFetch(); | ||
| if (!scanRecords.isEmpty()) { | ||
| // Check buckets() (includes progress-only buckets). | ||
| if (!scanRecords.buckets().isEmpty()) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. dito. Why do we need to change it.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same reason as above. |
||
| return scanRecords; | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -355,63 +355,89 @@ private RecordsWithSplitIds<TableBucketWriteResult<WriteResult>> forLogRecords( | |
| Map<TableBucket, TableBucketWriteResult<WriteResult>> writeResults = new HashMap<>(); | ||
| Map<TableBucket, String> finishedSplitIds = new HashMap<>(); | ||
|
|
||
| // Iterate every polled bucket, including those that only advanced their offset. | ||
| for (TableBucket bucket : scanRecords.buckets()) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sorry for revisit it again. I still found it hard to understand, I'm thinking change like it, WDTY?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're right, I've made the changes. |
||
| List<ScanRecord> bucketScanRecords = scanRecords.records(bucket); | ||
| if (bucketScanRecords.isEmpty()) { | ||
| continue; | ||
| } | ||
| // no any stopping offset, just skip handle the records for the bucket | ||
| Long stoppingOffset = currentTableStoppingOffsets.get(bucket); | ||
| if (stoppingOffset == null) { | ||
| continue; | ||
| } | ||
|
|
||
| List<ScanRecord> records = scanRecords.records(bucket); | ||
| LakeWriter<WriteResult> lakeWriter = null; | ||
| for (ScanRecord record : bucketScanRecords) { | ||
| // if record is less than stopping offset | ||
| if (record.logOffset() < stoppingOffset) { | ||
| if (lakeWriter == null) { | ||
| lakeWriter = | ||
| getOrCreateLakeWriter( | ||
| bucket, | ||
| currentTableSplitsByBucket.get(bucket).getPartitionName()); | ||
| } | ||
| lakeWriter.write(record); | ||
| if (record.getSizeInBytes() > 0) { | ||
| tieringMetrics.recordBytesRead(record.getSizeInBytes()); | ||
| } | ||
| ScanRecord lastRecord = null; | ||
|
|
||
| for (ScanRecord record : records) { | ||
| lastRecord = record; | ||
|
|
||
| // The scanner may return records beyond this split's exclusive stopping offset. | ||
| // Those records belong to the next split and must not be tiered here. | ||
| if (record.logOffset() >= stoppingOffset) { | ||
| continue; | ||
| } | ||
|
|
||
| if (lakeWriter == null) { | ||
| lakeWriter = | ||
| getOrCreateLakeWriter( | ||
| bucket, | ||
| currentTableSplitsByBucket.get(bucket).getPartitionName()); | ||
| } | ||
| lakeWriter.write(record); | ||
| if (record.getSizeInBytes() > 0) { | ||
| tieringMetrics.recordBytesRead(record.getSizeInBytes()); | ||
| } | ||
| } | ||
|
|
||
| // consumedUpToOffset is an exclusive upper bound: all offsets before it have been | ||
| // consumed by the scanner in this poll round. It may advance even when records is | ||
| // empty, for example when FIRST_ROW filters duplicate upserts into empty WAL batches. | ||
| Long consumedUpToOffset = scanRecords.consumedUpToOffset(bucket); | ||
| checkState( | ||
| consumedUpToOffset != null, | ||
| "Missing consumed-up-to offset for polled bucket %s.", | ||
| bucket); | ||
|
|
||
| // The split owns offsets before stoppingOffset only. If the scanner consumed past | ||
| // the split boundary, cap the tiered progress at stoppingOffset so the next split | ||
| // still owns later data. | ||
| long tieredLogEndOffset = Math.min(consumedUpToOffset, stoppingOffset); | ||
| long tieredTimestamp; | ||
| if (lastRecord != null) { | ||
| tieredTimestamp = lastRecord.timestamp(); | ||
| } else { | ||
| LogOffsetAndTimestamp latest = currentTableTieredOffsetAndTimestamp.get(bucket); | ||
| tieredTimestamp = latest != null ? latest.timestamp : UNKNOWN_BUCKET_TIMESTAMP; | ||
| } | ||
| ScanRecord lastRecord = bucketScanRecords.get(bucketScanRecords.size() - 1); | ||
| currentTableTieredOffsetAndTimestamp.put( | ||
| bucket, | ||
| new LogOffsetAndTimestamp(lastRecord.logOffset(), lastRecord.timestamp())); | ||
| // has arrived into the end of the split, | ||
| if (lastRecord.logOffset() >= stoppingOffset - 1) { | ||
| currentTableStoppingOffsets.remove(bucket); | ||
| if (bucket.getPartitionId() != null) { | ||
| currentLogScanner.unsubscribe(bucket.getPartitionId(), bucket.getBucket()); | ||
| } else { | ||
| // todo: should unsubscribe the log split if unsubscribe bucket for | ||
| // un-partitioned table is supported | ||
| } | ||
| TieringSplit currentTieringSplit = currentTableSplitsByBucket.remove(bucket); | ||
| String currentSplitId = currentTieringSplit.splitId(); | ||
| // put write result of the bucket | ||
| writeResults.put( | ||
| bucket, | ||
| completeLakeWriter( | ||
| bucket, | ||
| currentTieringSplit.getPartitionName(), | ||
| stoppingOffset, | ||
| lastRecord.timestamp())); | ||
| // put split of the bucket | ||
| finishedSplitIds.put(bucket, currentSplitId); | ||
| LOG.info( | ||
| "Finish tier bucket {} for table {}, split: {}.", | ||
| bucket, | ||
| currentTablePath, | ||
| currentSplitId); | ||
| bucket, new LogOffsetAndTimestamp(tieredLogEndOffset - 1, tieredTimestamp)); | ||
|
|
||
| // The split owns offsets below stoppingOffset. If the scanner has not consumed up to | ||
| // that exclusive bound yet, keep the split active. | ||
| if (consumedUpToOffset < stoppingOffset) { | ||
| continue; | ||
| } | ||
|
|
||
| currentTableStoppingOffsets.remove(bucket); | ||
| if (bucket.getPartitionId() != null) { | ||
| currentLogScanner.unsubscribe(bucket.getPartitionId(), bucket.getBucket()); | ||
| } else { | ||
| // todo: should unsubscribe the log split if unsubscribe bucket for | ||
| // un-partitioned table is supported | ||
| } | ||
| TieringSplit currentTieringSplit = currentTableSplitsByBucket.remove(bucket); | ||
| String currentSplitId = currentTieringSplit.splitId(); | ||
| writeResults.put( | ||
| bucket, | ||
| completeLakeWriter( | ||
| bucket, | ||
| currentTieringSplit.getPartitionName(), | ||
| stoppingOffset, | ||
| tieredTimestamp)); | ||
| finishedSplitIds.put(bucket, currentSplitId); | ||
| LOG.info( | ||
| "Finish tier bucket {} for table {}, split: {}.", | ||
| bucket, | ||
| currentTablePath, | ||
| currentSplitId); | ||
| } | ||
|
|
||
| if (!finishedSplitIds.isEmpty()) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why remove this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That NOTE is no longer true after this PR. Previously, empty records meant consumed position was not updated. Now collectFetch always records the offset via consumedUpToOffsets even when the materialized record list is empty (e.g. FIRST_ROW empty WAL batches). Keeping a guarantee the code no longer honors would be misleading.