From 58e7c8be408856dcdd3799ac7ab45356425b1844 Mon Sep 17 00:00:00 2001 From: Arnav Goel Date: Wed, 20 May 2026 02:54:40 -0400 Subject: [PATCH] chunker(range-tighten + size-cap): correctness clip + opt-in row-group splitter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two chunker-side changes, neither moves retrieval recall — explicit calibration on the full 912 SpreadsheetBench v0.1 with both BGE-small and text-embedding-3-large shows 0 instance flips for either change. Shipping for correctness, not recall. A. _tight_content_bbox + clip in _block_to_chunk Walks `block.cell_range`, finds the bbox of cells whose `raw_value` or non-empty `display_value` is set, and clips the chunk's claimed `(top_left_cell, bottom_right_cell)` to that bbox. Fixes the over- claim pathology where a sheet with styled-empty cells across XFD width produces a chunk claiming `A1:XFD4` despite the actual data sitting in a 5×3 corner. The renderer already iterates the original range, so the narrowed claim is always a superset of cells that contributed to render_text — invariant preserved. Bench (full 912 / text-embedding-3-large): recall_text@5: 0.750 → 0.750 (no change) recall_geometric@5: 0.482 → 0.482 (no change) recall_text@5_in_scope: 0.838 → 0.838 (no change) mean parse_ms: 156 → 174 (+18 ms; bbox walk) net instance flips: 0 miss→hit, 0 hit→miss Why no recall change despite fixing a real over-claim: the over- claims happen on sheets with empty-XFD blocks; the GT cells on those sheets are in OTHER blocks (proper data regions). So the over-claim was already a false negative for geom scoring, not a false positive that needed correction. The pathology lives in the dead zone of the retrieval metric. Citation UIs that highlight the chunk's claimed range still benefit — that's the actual value here. B. _split_block_by_rows + KS_CHUNK_BUDGET_CHARS env var When a block's render_text exceeds a configurable budget, split it into row-group sub-blocks with tight A1 ranges and non-overlapping coverage. Default budget is 100,000 chars — effectively OFF for any reasonable workbook on this corpus. Calibration on the 50-sample showed every smaller budget (2k, 4k, 8k) was net-zero or net- negative on retrieval recall because the embedding cannot discriminate between same-shape row-group children. Lower it via `KS_CHUNK_BUDGET_CHARS=2000` only if your downstream consumer has a strict per-chunk token economy that demands fragmentation; bench any such change against your own corpus first. Tests: tests/test_chunker_range_tighten.py (3 cases) + tests/test_chunker_size_cap.py (5 cases) — 8 added, 1071 → 1079 total passing. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ks_xlsx_parser/chunking/chunker.py | 207 ++++++++++++++++++++++++- tests/test_chunker_range_tighten.py | 143 +++++++++++++++++ tests/test_chunker_size_cap.py | 162 +++++++++++++++++++ 3 files changed, 508 insertions(+), 4 deletions(-) create mode 100644 tests/test_chunker_range_tighten.py create mode 100644 tests/test_chunker_size_cap.py diff --git a/src/ks_xlsx_parser/chunking/chunker.py b/src/ks_xlsx_parser/chunking/chunker.py index 7057e78..d16bbf6 100644 --- a/src/ks_xlsx_parser/chunking/chunker.py +++ b/src/ks_xlsx_parser/chunking/chunker.py @@ -12,9 +12,10 @@ from __future__ import annotations import logging +import os from ks_xlsx_parser.models.block import BlockDTO, ChunkDTO, DependencySummary -from ks_xlsx_parser.models.common import CellCoord, EdgeType +from ks_xlsx_parser.models.common import CellCoord, CellRange, EdgeType from ks_xlsx_parser.models.sheet import SheetDTO from ks_xlsx_parser.models.workbook import WorkbookDTO from ks_xlsx_parser.rendering.html_renderer import HtmlRenderer @@ -25,6 +26,166 @@ # Approximate tokens per character for English text (conservative estimate) CHARS_PER_TOKEN = 4 +# Cluster-04 chunk-size cap. A block whose rendered text exceeds this +# many chars gets split into row-group children, each with a tight A1 +# range covering only its rows. +# +# Default is intentionally HIGH (effectively off for any reasonable +# workbook). Empirical reason: on the 50-sample stratified for the +# cluster-04 large-sheet pattern, every budget tight enough to actually +# fragment moderate tables (2k–4k chars) regressed at least one +# instance because the embedding cannot discriminate between +# same-shape row-group children — splitting one chunk into N +# look-alike chunks makes the embedding pick the wrong child first. +# An 8k budget was neutral (0 wins, 0 regressions). Above that no +# real-world tables we measured split. So keep tables whole by +# default; the splitter is plumbing for downstream consumers who +# need hard per-chunk token economy (e.g. an embedding model with a +# strict 512-token ceiling). They opt in via +# ``KS_CHUNK_BUDGET_CHARS=2000`` and validate against THEIR corpus. +def _budget_chars() -> int: + """Read budget per call so tests can monkeypatch the env var.""" + return int(os.environ.get("KS_CHUNK_BUDGET_CHARS", "100000")) + +# Below this row count, splitting can't produce ≥2 children meaningfully +# (need at least 2 data rows to split into 2 groups). +MIN_ROWS_TO_SPLIT = 3 + + +def _split_block_by_rows( + block: BlockDTO, + sheet: SheetDTO, + text_renderer: TextRenderer, + budget_chars: int | None = None, +) -> list[BlockDTO]: + """Split an oversize block into contiguous row-group sub-blocks. + + Returns ``[block]`` unchanged when the block fits the budget. Otherwise + returns N sub-blocks with non-overlapping, contiguous row ranges + summing to the original block's row coverage. Each sub-block carries + a copy of the parent's block-level metadata (block_type, table_name, + named_ranges, density flags) and a subset of ``key_cells`` whose rows + fall inside the slice. + + Header preservation: this v1 emits row-group children WITHOUT + repeating the parent's header row(s). The cluster-04 doc calls + out "every group needs the header rows OR a header-summary block"; + we accept the simpler invariant here because the alternative + (overlapping cell_range across siblings, or out-of-band header + text whose cells live outside the chunk's claimed range) breaks + cluster-02's range-tightening invariant. Header-context recovery + is a follow-up. + """ + if budget_chars is None: + budget_chars = _budget_chars() + rng = block.cell_range + n_rows = rng.bottom_right.row - rng.top_left.row + 1 + if n_rows < MIN_ROWS_TO_SPLIT: + return [block] + + # Render once to measure. text_renderer is fast (~ms per 1000-row block); + # we already had to render in _block_to_chunk anyway — this just moves + # the work earlier. + full_text = text_renderer.render_block(block) + if len(full_text) <= budget_chars: + return [block] + + # Estimate rows-per-child from observed chars/row (averaged over the + # full render, including header overhead). Ceil to ensure children + # don't individually exceed the budget on the high end. + avg_chars_per_row = max(1.0, len(full_text) / n_rows) + rows_per_group = max(2, int(budget_chars / avg_chars_per_row)) + + sub_blocks: list[BlockDTO] = [] + cur = rng.top_left.row + while cur <= rng.bottom_right.row: + end = min(cur + rows_per_group - 1, rng.bottom_right.row) + sub_range = CellRange( + top_left=CellCoord(row=cur, col=rng.top_left.col), + bottom_right=CellCoord(row=end, col=rng.bottom_right.col), + ) + # Carry only the key_cells whose row falls inside this slice. + # Important for downstream metadata fidelity (key_cells encode + # "notable output cells" — bold, colored, etc.). + sub_key_cells = [k for k in block.key_cells if cur <= k.row <= end] + sub = BlockDTO( + block_index=block.block_index, + sheet_name=block.sheet_name, + block_type=block.block_type, + cell_range=sub_range, + bounding_box=block.bounding_box, + cell_count=block.cell_count, + formula_count=block.formula_count, + has_merges=block.has_merges, + has_formatting=block.has_formatting, + key_cells=sub_key_cells, + named_ranges=block.named_ranges, + table_name=block.table_name, + parent_block_id=block.block_id or None, + density=block.density, + label_cell_count=block.label_cell_count, + data_cell_count=block.data_cell_count, + table_structure_id=block.table_structure_id, + ) + sub_blocks.append(sub) + cur = end + 1 + + return sub_blocks + + +def _tight_content_bbox( + block: BlockDTO, sheet: SheetDTO, +) -> tuple[int, int, int, int] | None: + """Bounding box of cells with non-empty content within ``block.cell_range``. + + Returns ``(r0, c0, r1, c1)`` or ``None`` if the entire range is + empty. "Non-empty" = ``raw_value`` is not None OR ``display_value`` + has a non-whitespace string. Merged-slave cells count as empty + (their master is what carries the value). + + Used to clip overclaiming chunk ranges before emission so that + geometric overlap scoring (cluster-02 cohort) is honest. A sheet + that has data in A1:E50 but whose segmenter handed us A1:XFD500 + used to over-claim coverage of any GT cell in the empty area; + after the clip, the chunk only claims what it can actually + answer questions about. + + Iterates ``sheet.cells`` rather than the dense range — many sheets + have styled-empty cells stored across XFD columns, and the dense + loop costs ~65k iterations per row for those. + """ + rng = block.cell_range + r0_, c0_ = rng.top_left.row, rng.top_left.col + r1_, c1_ = rng.bottom_right.row, rng.bottom_right.col + + min_row = min_col = None + max_row = max_col = None + # sheet.cells is dict[str, CellDTO] keyed "row,col"; iterate values + # and read coord off the DTO. Avoids iterating the dense range when + # styled-empty cells span XFD width (16384 cols). + for cell in sheet.cells.values(): + r, c = cell.coord.row, cell.coord.col + if not (r0_ <= r <= r1_ and c0_ <= c <= c1_): + continue + if cell.is_merged_slave: + continue + if cell.raw_value is None and not ( + cell.display_value and cell.display_value.strip() + ): + continue + if min_row is None or r < min_row: + min_row = r + if max_row is None or r > max_row: + max_row = r + if min_col is None or c < min_col: + min_col = c + if max_col is None or c > max_col: + max_col = c + + if min_row is None: + return None + return (min_row, min_col, max_row, max_col) + class ChunkBuilder: """ @@ -92,6 +253,23 @@ def build_all(self) -> list[ChunkDTO]: html_renderer = HtmlRenderer(sheet) text_renderer = TextRenderer(sheet) + # Cluster-04 size cap: replace oversize blocks with row-group + # children before chunk emission. Finalize the children with + # the workbook hash so their block_ids are stable IDs. + expanded_blocks: list[BlockDTO] = [] + for block in blocks: + children = _split_block_by_rows(block, sheet, text_renderer) + if len(children) > 1: + # Re-finalize each child with the workbook hash so its + # block_id is deterministic. (The parent kept its own + # block_id from the earlier finalize; children inherit + # parent_block_id and get a fresh ID off the child's + # narrower cell_range.) + for child in children: + child.finalize(self._workbook.workbook_hash) + expanded_blocks.extend(children) + blocks = expanded_blocks + for block in blocks: chunk = self._block_to_chunk( block, sheet, html_renderer, text_renderer @@ -154,12 +332,33 @@ def _block_to_chunk( for coord in block.key_cells ] + # Cluster-02 invariant: the chunk's claimed A1 range must be a + # tight bbox over cells with content. Overclaiming inflates + # geometric recall for the parser but lies to downstream + # consumers (a UI citation that highlights an empty area). + # We clip but never widen — the renderer already only outputs + # cells inside `block.cell_range`, so a narrowed range still + # contains every cell that contributed to `render_text`. + tight = _tight_content_bbox(block, sheet) + if tight is not None: + r0, c0, r1, c1 = tight + chunk_range = CellRange( + top_left=CellCoord(row=r0, col=c0), + bottom_right=CellCoord(row=r1, col=c1), + ) + else: + # Block has zero non-empty cells (styled-empty cells across + # XFD columns is a real shape on this corpus). Claim only the + # top-left cell so the chunk is honest about being empty. + tl = block.cell_range.top_left + chunk_range = CellRange(top_left=tl, bottom_right=tl) + return ChunkDTO( sheet_name=block.sheet_name, block_type=block.block_type, - top_left_cell=block.cell_range.top_left.to_a1(), - bottom_right_cell=block.cell_range.bottom_right.to_a1(), - cell_range=block.cell_range, + top_left_cell=chunk_range.top_left.to_a1(), + bottom_right_cell=chunk_range.bottom_right.to_a1(), + cell_range=chunk_range, key_cells=key_cells, named_ranges=block.named_ranges, dependency_summary=dep_summary, diff --git a/tests/test_chunker_range_tighten.py b/tests/test_chunker_range_tighten.py new file mode 100644 index 0000000..c16d6af --- /dev/null +++ b/tests/test_chunker_range_tighten.py @@ -0,0 +1,143 @@ +"""Range-tightening invariant for chunk emission. + +Cluster-02 closes when every chunk's claimed (top_left, bottom_right) +range is the bounding box of cells with non-empty content within +``block.cell_range``. Today the chunker copies ``block.cell_range`` +verbatim — when the segmenter hands it a block whose range is wider +than the data (empty rows/cols at the edges), the chunk overclaims +geometric coverage. + +These tests assert the invariant on real fixtures + a hand-built +"data in the upper-left corner of a wider block" workbook. +""" +from __future__ import annotations + +import openpyxl + +from ks_xlsx_parser.api import parse_workbook +from ks_xlsx_parser.models.common import addr_to_a1 + + +def _a1_to_coord(a1: str) -> tuple[int, int]: + """Inverse of addr_to_a1 — small helper for tests only.""" + letters = "".join(c for c in a1 if c.isalpha()) + digits = "".join(c for c in a1 if c.isdigit()) + col = 0 + for ch in letters.upper(): + col = col * 26 + (ord(ch) - ord("A") + 1) + return int(digits), col + + +def test_chunk_range_tight_around_actual_content(tmp_path): + """A block whose range covers A1:E20 but only has data in A1:C5 + must emit a chunk that claims A1:C5 (or tighter), not A1:E20.""" + p = tmp_path / "sparse.xlsx" + wb = openpyxl.Workbook() + ws = wb.active + ws.title = "S" + # Fill only the upper-left 5×3 region; rest of the block is empty. + ws["A1"] = "Name" + ws["B1"] = "Q1" + ws["C1"] = "Q2" + for i, name in enumerate(["Alpha", "Bravo", "Charlie", "Delta"], start=2): + ws.cell(row=i, column=1, value=name) + ws.cell(row=i, column=2, value=i * 10) + ws.cell(row=i, column=3, value=i * 20) + wb.save(p) + + result = parse_workbook(path=str(p)) + chunks_on_S = [c for c in result.chunks if c.sheet_name == "S"] + assert chunks_on_S, "no chunks emitted on S" + for c in chunks_on_S: + r1, c1 = _a1_to_coord(c.bottom_right_cell) + # Bottom-right of the claimed range must not exceed the data bbox. + assert r1 <= 5, ( + f"chunk over-claims rows: {c.top_left_cell}:{c.bottom_right_cell} " + f"but data ends at row 5" + ) + assert c1 <= 3, ( + f"chunk over-claims cols: {c.top_left_cell}:{c.bottom_right_cell} " + f"but data ends at col C" + ) + + +def test_chunk_range_unchanged_when_block_is_already_tight(tmp_path): + """If block.cell_range already matches the data bbox, the chunk's + claimed range must be identical — no spurious narrowing.""" + p = tmp_path / "dense.xlsx" + wb = openpyxl.Workbook() + ws = wb.active + ws.title = "S" + ws["A1"] = "h1" + ws["B1"] = "h2" + ws["A2"] = 1 + ws["B2"] = 2 + wb.save(p) + + result = parse_workbook(path=str(p)) + chunks_on_S = [c for c in result.chunks if c.sheet_name == "S"] + assert chunks_on_S + # The single dense block should claim A1:B2 exactly. + found_tight = any( + c.top_left_cell == "A1" and c.bottom_right_cell == "B2" + for c in chunks_on_S + ) + assert found_tight, [ + (c.top_left_cell, c.bottom_right_cell) for c in chunks_on_S + ] + + +def test_chunk_range_invariant_holds_on_corpus_fixture(): + """End-to-end on a real SpreadsheetBench input — every emitted + chunk's range must be a tight bbox over cells with content. + Uses one of the 50-sample sheets; skipped if corpus absent.""" + import os + p = ( + "data/corpora/spreadsheetbench/all_data_912_v0.1" + "/spreadsheet/54105/1_54105_input.xlsx" + ) + if not os.path.exists(p): + import pytest + pytest.skip(f"corpus fixture not present: {p}") + + result = parse_workbook(path=p) + import openpyxl + wb = openpyxl.load_workbook(p, data_only=True) + + for c in result.chunks: + if not c.top_left_cell or not c.bottom_right_cell: + continue + ws = wb[c.sheet_name] + r0, c0 = _a1_to_coord(c.top_left_cell) + r1, c1 = _a1_to_coord(c.bottom_right_cell) + # The CORNER rows/cols of the claimed range must each contain + # some non-empty cell — otherwise the range is over-claiming. + def _row_has_content(r: int) -> bool: + for col in range(c0, c1 + 1): + v = ws.cell(row=r, column=col).value + if v is not None and str(v).strip(): + return True + return False + + def _col_has_content(col: int) -> bool: + for r in range(r0, r1 + 1): + v = ws.cell(row=r, column=col).value + if v is not None and str(v).strip(): + return True + return False + + # Allow chart anchors (single-cell, no real content) to pass. + if r0 == r1 and c0 == c1: + continue + assert _row_has_content(r0), ( + f"top row of chunk {c.top_left_cell}:{c.bottom_right_cell} is empty" + ) + assert _row_has_content(r1), ( + f"bottom row of chunk {c.top_left_cell}:{c.bottom_right_cell} is empty" + ) + assert _col_has_content(c0), ( + f"left col of chunk {c.top_left_cell}:{c.bottom_right_cell} is empty" + ) + assert _col_has_content(c1), ( + f"right col of chunk {c.top_left_cell}:{c.bottom_right_cell} is empty" + ) diff --git a/tests/test_chunker_size_cap.py b/tests/test_chunker_size_cap.py new file mode 100644 index 0000000..6db0ade --- /dev/null +++ b/tests/test_chunker_size_cap.py @@ -0,0 +1,162 @@ +"""Token-budget chunk size cap. + +Cluster-04: single huge chunks dilute embeddings — when a sheet has a +1000-row table, putting it in one chunk means the embedding mixes the +question-relevant rows with 999 others and recall@5 suffers. The fix +is to split oversize blocks into row groups, each with a tight A1 +range covering only its rows. + +These tests assert: + 1. A synthetic 1000-row table emits ≥ 2 chunks (was 1 before). + 2. No chunk's render_text exceeds the budget (~2000 chars). + 3. Each child chunk's range is contiguous and non-overlapping with + siblings. + 4. Total row coverage of children == original block's row range + (no data dropped, none duplicated). + 5. Small blocks (≤ budget) are emitted unchanged — no spurious + splitting on dense single-table sheets. +""" +from __future__ import annotations + +import openpyxl + +from ks_xlsx_parser.api import parse_workbook + + +def _a1_to_coord(a1: str) -> tuple[int, int]: + letters = "".join(c for c in a1 if c.isalpha()) + digits = "".join(c for c in a1 if c.isdigit()) + col = 0 + for ch in letters.upper(): + col = col * 26 + (ord(ch) - ord("A") + 1) + return int(digits), col + + +def test_thousand_row_table_splits_into_multiple_chunks(tmp_path, monkeypatch): + """With the cap explicitly tightened (KS_CHUNK_BUDGET_CHARS=2000) a + 1000-row table must emit ≥ 2 chunks. The default budget is much + higher (cap effectively off) — see test_default_keeps_tables_whole + for the inverse assertion.""" + monkeypatch.setenv("KS_CHUNK_BUDGET_CHARS", "2000") + p = tmp_path / "big.xlsx" + wb = openpyxl.Workbook() + ws = wb.active + ws.title = "S" + ws.append(["id", "name", "value"]) + for i in range(1, 1001): + ws.append([i, f"name-{i}", i * 7]) + wb.save(p) + + result = parse_workbook(path=str(p)) + chunks_on_S = [c for c in result.chunks if c.sheet_name == "S"] + assert len(chunks_on_S) >= 2, ( + f"expected ≥2 chunks after cap, got {len(chunks_on_S)} " + f"(render_text lengths: " + f"{[len(c.render_text or '') for c in chunks_on_S]})" + ) + + +def test_default_keeps_tables_whole(tmp_path): + """The shipped default budget (~100k chars) deliberately does NOT + fragment moderate tables — calibration on the 50-sample showed + splitting moderate tables (1k–10k chars) regresses retrieval + because the embedding can't discriminate between same-shape + children. Keep tables together by default.""" + p = tmp_path / "big.xlsx" + wb = openpyxl.Workbook() + ws = wb.active + ws.title = "S" + ws.append(["id", "name", "value"]) + for i in range(1, 1001): + ws.append([i, f"name-{i}", i * 7]) + wb.save(p) + + result = parse_workbook(path=str(p)) + chunks_on_S = [c for c in result.chunks if c.sheet_name == "S"] + assert len(chunks_on_S) == 1, ( + f"default should not split a 1000-row table; got " + f"{len(chunks_on_S)} chunks" + ) + + +def test_no_chunk_exceeds_render_budget(tmp_path, monkeypatch): + """Each chunk's render_text must stay near the configured budget + when the cap is engaged.""" + monkeypatch.setenv("KS_CHUNK_BUDGET_CHARS", "2000") + p = tmp_path / "big.xlsx" + wb = openpyxl.Workbook() + ws = wb.active + ws.title = "S" + ws.append(["id", "name", "value"]) + for i in range(1, 1001): + ws.append([i, f"name-{i}", i * 7]) + wb.save(p) + + result = parse_workbook(path=str(p)) + # Allow 20% overshoot (average-rows-per-budget heuristic) + 200 chars + # for the block-header + column-letters + separator lines. + cap = int(2000 * 1.2) + 200 + for c in result.chunks: + assert len(c.render_text or "") <= cap, ( + f"chunk over budget (2000+slack={cap}): " + f"{len(c.render_text)} chars on " + f"{c.top_left_cell}:{c.bottom_right_cell}" + ) + + +def test_child_chunk_ranges_are_contiguous_and_non_overlapping( + tmp_path, monkeypatch, +): + monkeypatch.setenv("KS_CHUNK_BUDGET_CHARS", "2000") + p = tmp_path / "big.xlsx" + wb = openpyxl.Workbook() + ws = wb.active + ws.title = "S" + ws.append(["id", "name", "value"]) + for i in range(1, 501): + ws.append([i, f"name-{i}", i * 7]) + wb.save(p) + + result = parse_workbook(path=str(p)) + chunks_on_S = [c for c in result.chunks if c.sheet_name == "S"] + assert len(chunks_on_S) >= 2 + + # Sort by top row, then check contiguity / non-overlap. + ranges = sorted( + [ + ( + _a1_to_coord(c.top_left_cell), + _a1_to_coord(c.bottom_right_cell), + ) + for c in chunks_on_S + ], + key=lambda x: (x[0][0], x[0][1]), + ) + for (tl_a, br_a), (tl_b, br_b) in zip(ranges, ranges[1:]): + # No overlap on rows (since splits are by row-group) + assert br_a[0] < tl_b[0], ( + f"row overlap between {tl_a}-{br_a} and {tl_b}-{br_b}" + ) + # Contiguous: child B starts right after child A ends + assert tl_b[0] == br_a[0] + 1, ( + f"row gap between {tl_a}-{br_a} and {tl_b}-{br_b}" + ) + + +def test_small_block_not_split(tmp_path): + """A 10-row table fits in one chunk; cap must not over-split.""" + p = tmp_path / "small.xlsx" + wb = openpyxl.Workbook() + ws = wb.active + ws.title = "S" + ws.append(["id", "name"]) + for i in range(1, 11): + ws.append([i, f"n{i}"]) + wb.save(p) + + result = parse_workbook(path=str(p)) + chunks_on_S = [c for c in result.chunks if c.sheet_name == "S"] + assert len(chunks_on_S) == 1, ( + f"unexpected split on small block: " + f"{[(c.top_left_cell, c.bottom_right_cell) for c in chunks_on_S]}" + )