From 49983ec1597021cd5263bdc83420a2ba8c9a0c3c Mon Sep 17 00:00:00 2001 From: "yonghao.fyh" Date: Thu, 7 May 2026 11:12:01 +0800 Subject: [PATCH] [C++] Expose prefetch range planning via Reader::preBufferRange and refactor preBuffer to reuse it --- c++/include/orc/Reader.hh | 11 +++++++++++ c++/src/Reader.cc | 31 ++++++++++++++++++++++++++----- c++/src/Reader.hh | 3 +++ 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh index 5a88994536..122a781c93 100644 --- a/c++/include/orc/Reader.hh +++ b/c++/include/orc/Reader.hh @@ -32,6 +32,7 @@ #include #include #include +#include #include namespace orc { @@ -696,6 +697,16 @@ namespace orc { virtual void preBuffer(const std::vector& stripes, const std::list& includeTypes) = 0; + /** + * Calculate prefetch ranges by selected stripes and columns. + * It is thread safe and does not cache data. + * @param stripes the stripes to prefetch + * @param includeTypes the types to prefetch + * @return prefetch ranges as offset/length pairs + */ + virtual std::vector> preBufferRange( + const std::vector& stripes, const std::list& includeTypes) = 0; + /** * Release cached entries whose right boundary is less than or equal to the given boundary. * @param boundary the boundary value to release cache entries diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 4fd1a73a95..511618e4ac 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -1770,8 +1770,8 @@ namespace orc { contents_->evictCache(boundary); } - void ReaderImpl::preBuffer(const std::vector& stripes, - const std::list& includeTypes) { + std::vector> ReaderImpl::preBufferRange( + const std::vector& stripes, const std::list& includeTypes) { std::vector newStripes; for (auto stripe : stripes) { if (stripe < static_cast(footer_->stripes_size())) newStripes.push_back(stripe); @@ -1783,7 +1783,7 @@ namespace orc { } if (newStripes.empty() || newIncludeTypes.empty()) { - return; + return {}; } orc::RowReaderOptions rowReaderOptions; @@ -1792,12 +1792,33 @@ namespace orc { std::vector selectedColumns; columnSelector.updateSelected(selectedColumns, rowReaderOptions); + std::vector> ranges; + for (auto stripe : newStripes) { const auto& stripeInfo = footer_->stripes(stripe); proto::StripeFooter stripeFooter = getStripeFooter(stripeInfo, *contents_); - auto ranges = extractReadRangesForStripe(stripe, stripeInfo, stripeFooter, selectedColumns); - contents_->cacheRanges(std::move(ranges)); + auto stripeRanges = + extractReadRangesForStripe(stripe, stripeInfo, stripeFooter, selectedColumns); + for (const auto& range : stripeRanges) { + ranges.emplace_back(range.offset, range.length); + } + } + return ranges; + } + + void ReaderImpl::preBuffer(const std::vector& stripes, + const std::list& includeTypes) { + auto ranges = preBufferRange(stripes, includeTypes); + if (ranges.empty()) { + return; + } + + std::vector readRanges; + readRanges.reserve(ranges.size()); + for (const auto& range : ranges) { + readRanges.emplace_back(range.first, range.second); } + contents_->cacheRanges(std::move(readRanges)); } RowReader::~RowReader() { diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh index 132f92ebb5..204f678e96 100644 --- a/c++/src/Reader.hh +++ b/c++/src/Reader.hh @@ -410,6 +410,9 @@ namespace orc { std::map getBloomFilters( uint32_t stripeIndex, const std::set& included) const override; + std::vector> preBufferRange( + const std::vector& stripes, const std::list& includeTypes) override; + void preBuffer(const std::vector& stripes, const std::list& includeTypes) override; void releaseBuffer(uint64_t boundary) override;