44#include " xxhash.h"
55
66#include < string>
7- #include < string_view> // C++17: Essential for zero-copy parsing
7+ #include < string_view>
88#include < vector>
99#include < map>
1010#include < cstring>
@@ -78,7 +78,6 @@ struct MemoryMappedFile {
7878
7979class BBFReader {
8080private:
81- // Cached pointers to avoid recalculating offsets repeatedly
8281 const char * data_ptr = nullptr ;
8382 const BBFSection* sections_ = nullptr ;
8483 const BBFMetadata* meta_ = nullptr ;
@@ -108,8 +107,9 @@ class BBFReader {
108107 std::memcpy (&footer, data_ptr + mmap.size - sizeof (BBFFooter), sizeof (BBFFooter));
109108 if (std::memcmp (footer.magic , " BBF1" , 4 ) != 0 ) return ;
110109
111- // Cache Table Pointers
112- // Note: In production, you should add bounds checks here to ensure offsets are within mmap.size
110+ // Safety: Ensure tables point inside the file
111+ if (footer.assetTableOffset >= mmap.size || footer.pageTableOffset >= mmap.size ) return ;
112+
113113 sections_ = reinterpret_cast <const BBFSection*>(data_ptr + footer.sectionTableOffset );
114114 meta_ = reinterpret_cast <const BBFMetadata*>(data_ptr + footer.metaTableOffset );
115115 pages_ = reinterpret_cast <const BBFPageEntry*>(data_ptr + footer.pageTableOffset );
@@ -121,12 +121,8 @@ class BBFReader {
121121 isValid = true ;
122122 }
123123
124- // Optimized: Returns string_view (no allocation)
125- // Helper to allow returning std::string for legacy binding support if needed,
126- // but internal logic should prefer views.
127124 std::string_view getStringView (uint32_t offset) const {
128125 if (offset >= stringPoolSize_) return {};
129- // Requires strings in file to be null-terminated.
130126 return std::string_view (stringPool_ + offset);
131127 }
132128
@@ -140,9 +136,8 @@ class BBFReader {
140136 std::vector<PySection> result;
141137 if (!isValid) return result;
142138
143- result.reserve (footer.sectionCount ); // Optimization: Reserve memory
139+ result.reserve (footer.sectionCount );
144140 for (uint32_t i = 0 ; i < footer.sectionCount ; i++) {
145- // Explicit conversion to std::string here is okay as we are handing off to Python
146141 result.push_back ({
147142 std::string (getStringView (sections_[i].sectionTitleOffset )),
148143 sections_[i].sectionStartIndex ,
@@ -166,21 +161,14 @@ class BBFReader {
166161 return result;
167162 }
168163
169- // Zero-copy accessor for PyBind
170- // Returns {pointer, size}
171164 std::pair<const char *, size_t > getPageRaw (uint32_t pageIndex) const {
172165 if (!isValid || pageIndex >= footer.pageCount ) return {nullptr , 0 };
173166
174- // Indirect addressing: Page -> Asset -> Offset/Length
175167 const auto & asset = assets_[pages_[pageIndex].assetIndex ];
176- return { data_ptr + asset. offset , asset. length };
177- }
168+ // Bounds check on asset
169+ if (asset. offset + asset. length > mmap. size ) return { nullptr , 0 };
178170
179- // Legacy support (copies data)
180- std::string getPageBytes (uint32_t pageIndex) const {
181- auto raw = getPageRaw (pageIndex);
182- if (!raw.first ) return " " ;
183- return std::string (raw.first , raw.second );
171+ return { data_ptr + asset.offset , static_cast <size_t >(asset.length ) };
184172 }
185173
186174 std::map<std::string, uint64_t > getPageInfo (uint32_t pageIndex) const {
@@ -191,56 +179,62 @@ class BBFReader {
191179 {" length" , asset.length },
192180 {" offset" , asset.offset },
193181 {" hash" , asset.xxh3Hash },
194- {" type" , asset.type }
182+ {" type" , asset.type },
183+ {" decodedLength" , asset.decodedLength } // ADDED: v1.1 Spec
195184 };
196185 }
197186
198- bool verify () const {
199- if (!isValid) return false ;
187+ // Returns -1 for Success, -2 for Directory Error, or >=0 for Asset Index Error
188+ int64_t verify () const {
189+ if (!isValid) return -2 ;
200190
201191 // 1. Directory Hash Check
202192 size_t metaStart = footer.stringPoolOffset ;
203193 size_t metaSize = mmap.size - sizeof (BBFFooter) - metaStart;
204- if (XXH3_64bits (data_ptr + metaStart, metaSize) != footer.indexHash ) return false ;
194+ if (XXH3_64bits (data_ptr + metaStart, metaSize) != footer.indexHash ) return - 2 ;
205195
206196 // 2. Asset Integrity Check
207197 size_t count = footer.assetCount ;
208- const auto * local_assets = assets_; // Copy pointer for lambda capture
198+ const auto * local_assets = assets_;
209199 const auto * local_data = data_ptr;
200+ size_t max_size = mmap.size ;
210201
211- auto verifyRange = [local_assets, local_data](size_t start, size_t end) -> bool {
202+ // Lambda returns -1 if OK, or the index if Bad
203+ auto verifyRange = [local_assets, local_data, max_size](size_t start, size_t end) -> int64_t {
212204 for (size_t i = start; i < end; ++i) {
213205 const auto & a = local_assets[i];
206+ // Bounds check before hash
207+ if (a.offset + a.length > max_size) return (int64_t )i;
208+
214209 if (XXH3_64bits ((const uint8_t *)local_data + a.offset , a.length ) != a.xxh3Hash ) {
215- return false ;
210+ return ( int64_t )i; // Return the corrupted index
216211 }
217212 }
218- return true ;
213+ return - 1 ; // Success
219214 };
220215
221- // Optimization: Don't spawn threads for small files
222216 size_t numThreads = std::thread::hardware_concurrency ();
223217 if (numThreads == 0 ) numThreads = 1 ;
224218
225- // Heuristic: If assets < 128, threading overhead > hashing gain
226219 if (count < 128 || numThreads == 1 ) {
227220 return verifyRange (0 , count);
228221 }
229222
230223 size_t chunkSize = count / numThreads;
231- std::vector<std::future<bool >> futures;
224+ std::vector<std::future<int64_t >> futures; // Changed from bool to int64_t
232225 futures.reserve (numThreads);
233226
234227 for (size_t i = 0 ; i < numThreads; ++i) {
235228 size_t start = i * chunkSize;
236229 size_t end = (i == numThreads - 1 ) ? count : start + chunkSize;
237- // Launch async
238230 futures.push_back (std::async (std::launch::async, verifyRange, start, end));
239231 }
240232
233+ // Check results
241234 for (auto & f : futures) {
242- if (!f.get ()) return false ;
235+ int64_t result = f.get ();
236+ if (result != -1 ) return result; // Bubble up the error index
243237 }
244- return true ;
238+ return - 1 ; // All good
245239 }
246240};
0 commit comments