knowledgestack · arnav2 · May 20, 2026 · May 20, 2026
diff --git a/src/ks_xlsx_parser/chunking/chunker.py b/src/ks_xlsx_parser/chunking/chunker.py
@@ -12,9 +12,10 @@
 from __future__ import annotations
 
 import logging
+import os
 
 from ks_xlsx_parser.models.block import BlockDTO, ChunkDTO, DependencySummary
-from ks_xlsx_parser.models.common import CellCoord, EdgeType
+from ks_xlsx_parser.models.common import CellCoord, CellRange, EdgeType
 from ks_xlsx_parser.models.sheet import SheetDTO
 from ks_xlsx_parser.models.workbook import WorkbookDTO
 from ks_xlsx_parser.rendering.html_renderer import HtmlRenderer
@@ -25,6 +26,166 @@
 # Approximate tokens per character for English text (conservative estimate)
 CHARS_PER_TOKEN = 4
 
+# Cluster-04 chunk-size cap. A block whose rendered text exceeds this
+# many chars gets split into row-group children, each with a tight A1
+# range covering only its rows.
+#
+# Default is intentionally HIGH (effectively off for any reasonable
+# workbook). Empirical reason: on the 50-sample stratified for the
+# cluster-04 large-sheet pattern, every budget tight enough to actually
+# fragment moderate tables (2k–4k chars) regressed at least one
+# instance because the embedding cannot discriminate between
+# same-shape row-group children — splitting one chunk into N
+# look-alike chunks makes the embedding pick the wrong child first.
+# An 8k budget was neutral (0 wins, 0 regressions). Above that no
+# real-world tables we measured split. So keep tables whole by
+# default; the splitter is plumbing for downstream consumers who
+# need hard per-chunk token economy (e.g. an embedding model with a
+# strict 512-token ceiling). They opt in via
+# ``KS_CHUNK_BUDGET_CHARS=2000`` and validate against THEIR corpus.
+def _budget_chars() -> int:
+    """Read budget per call so tests can monkeypatch the env var."""
+    return int(os.environ.get("KS_CHUNK_BUDGET_CHARS", "100000"))
+
+# Below this row count, splitting can't produce ≥2 children meaningfully
+# (need at least 2 data rows to split into 2 groups).
+MIN_ROWS_TO_SPLIT = 3
+
+
+def _split_block_by_rows(
+    block: BlockDTO,
+    sheet: SheetDTO,
+    text_renderer: TextRenderer,
+    budget_chars: int | None = None,
+) -> list[BlockDTO]:
+    """Split an oversize block into contiguous row-group sub-blocks.
+
+    Returns ``[block]`` unchanged when the block fits the budget. Otherwise
+    returns N sub-blocks with non-overlapping, contiguous row ranges
+    summing to the original block's row coverage. Each sub-block carries
+    a copy of the parent's block-level metadata (block_type, table_name,
+    named_ranges, density flags) and a subset of ``key_cells`` whose rows
+    fall inside the slice.
+
+    Header preservation: this v1 emits row-group children WITHOUT
+    repeating the parent's header row(s). The cluster-04 doc calls
+    out "every group needs the header rows OR a header-summary block";
+    we accept the simpler invariant here because the alternative
+    (overlapping cell_range across siblings, or out-of-band header
+    text whose cells live outside the chunk's claimed range) breaks
+    cluster-02's range-tightening invariant. Header-context recovery
+    is a follow-up.
+    """
+    if budget_chars is None:
+        budget_chars = _budget_chars()
+    rng = block.cell_range
+    n_rows = rng.bottom_right.row - rng.top_left.row + 1
+    if n_rows < MIN_ROWS_TO_SPLIT:
+        return [block]
+
+    # Render once to measure. text_renderer is fast (~ms per 1000-row block);
+    # we already had to render in _block_to_chunk anyway — this just moves
+    # the work earlier.
+    full_text = text_renderer.render_block(block)
+    if len(full_text) <= budget_chars:
+        return [block]
+
+    # Estimate rows-per-child from observed chars/row (averaged over the
+    # full render, including header overhead). Ceil to ensure children
+    # don't individually exceed the budget on the high end.
+    avg_chars_per_row = max(1.0, len(full_text) / n_rows)
+    rows_per_group = max(2, int(budget_chars / avg_chars_per_row))
+
+    sub_blocks: list[BlockDTO] = []
+    cur = rng.top_left.row
+    while cur <= rng.bottom_right.row:
+        end = min(cur + rows_per_group - 1, rng.bottom_right.row)
+        sub_range = CellRange(
+            top_left=CellCoord(row=cur, col=rng.top_left.col),
+            bottom_right=CellCoord(row=end, col=rng.bottom_right.col),
+        )
+        # Carry only the key_cells whose row falls inside this slice.
+        # Important for downstream metadata fidelity (key_cells encode
+        # "notable output cells" — bold, colored, etc.).
+        sub_key_cells = [k for k in block.key_cells if cur <= k.row <= end]
+        sub = BlockDTO(
+            block_index=block.block_index,
+            sheet_name=block.sheet_name,
+            block_type=block.block_type,
+            cell_range=sub_range,
+            bounding_box=block.bounding_box,
+            cell_count=block.cell_count,
+            formula_count=block.formula_count,
+            has_merges=block.has_merges,
+            has_formatting=block.has_formatting,
+            key_cells=sub_key_cells,
+            named_ranges=block.named_ranges,
+            table_name=block.table_name,
+            parent_block_id=block.block_id or None,
+            density=block.density,
+            label_cell_count=block.label_cell_count,
+            data_cell_count=block.data_cell_count,
+            table_structure_id=block.table_structure_id,
+        )
+        sub_blocks.append(sub)
+        cur = end + 1
+
+    return sub_blocks
+
+
+def _tight_content_bbox(
+    block: BlockDTO, sheet: SheetDTO,
+) -> tuple[int, int, int, int] | None:
+    """Bounding box of cells with non-empty content within ``block.cell_range``.
+
+    Returns ``(r0, c0, r1, c1)`` or ``None`` if the entire range is
+    empty. "Non-empty" = ``raw_value`` is not None OR ``display_value``
+    has a non-whitespace string. Merged-slave cells count as empty
+    (their master is what carries the value).
+
+    Used to clip overclaiming chunk ranges before emission so that
+    geometric overlap scoring (cluster-02 cohort) is honest. A sheet
+    that has data in A1:E50 but whose segmenter handed us A1:XFD500
+    used to over-claim coverage of any GT cell in the empty area;
+    after the clip, the chunk only claims what it can actually
+    answer questions about.
+
+    Iterates ``sheet.cells`` rather than the dense range — many sheets
+    have styled-empty cells stored across XFD columns, and the dense
+    loop costs ~65k iterations per row for those.
+    """
+    rng = block.cell_range
+    r0_, c0_ = rng.top_left.row, rng.top_left.col
+    r1_, c1_ = rng.bottom_right.row, rng.bottom_right.col
+
+    min_row = min_col = None
+    max_row = max_col = None
+    # sheet.cells is dict[str, CellDTO] keyed "row,col"; iterate values
+    # and read coord off the DTO. Avoids iterating the dense range when
+    # styled-empty cells span XFD width (16384 cols).
+    for cell in sheet.cells.values():
+        r, c = cell.coord.row, cell.coord.col
+        if not (r0_ <= r <= r1_ and c0_ <= c <= c1_):
+            continue
+        if cell.is_merged_slave:
+            continue
+        if cell.raw_value is None and not (
+            cell.display_value and cell.display_value.strip()
+        ):
+            continue
+        if min_row is None or r < min_row:
+            min_row = r
+        if max_row is None or r > max_row:
+            max_row = r
+        if min_col is None or c < min_col:
+            min_col = c
+        if max_col is None or c > max_col:
+            max_col = c
+
+    if min_row is None:
+        return None
+    return (min_row, min_col, max_row, max_col)
+
 
 class ChunkBuilder:
     """
@@ -92,6 +253,23 @@ def build_all(self) -> list[ChunkDTO]:
             html_renderer = HtmlRenderer(sheet)
             text_renderer = TextRenderer(sheet)
 
+            # Cluster-04 size cap: replace oversize blocks with row-group
+            # children before chunk emission. Finalize the children with
+            # the workbook hash so their block_ids are stable IDs.
+            expanded_blocks: list[BlockDTO] = []
+            for block in blocks:
+                children = _split_block_by_rows(block, sheet, text_renderer)
+                if len(children) > 1:
+                    # Re-finalize each child with the workbook hash so its
+                    # block_id is deterministic. (The parent kept its own
+                    # block_id from the earlier finalize; children inherit
+                    # parent_block_id and get a fresh ID off the child's
+                    # narrower cell_range.)
+                    for child in children:
+                        child.finalize(self._workbook.workbook_hash)
+                expanded_blocks.extend(children)
+            blocks = expanded_blocks
+
             for block in blocks:
                 chunk = self._block_to_chunk(
                     block, sheet, html_renderer, text_renderer
@@ -154,12 +332,33 @@ def _block_to_chunk(
             for coord in block.key_cells
         ]
 
+        # Cluster-02 invariant: the chunk's claimed A1 range must be a
+        # tight bbox over cells with content. Overclaiming inflates
+        # geometric recall for the parser but lies to downstream
+        # consumers (a UI citation that highlights an empty area).
+        # We clip but never widen — the renderer already only outputs
+        # cells inside `block.cell_range`, so a narrowed range still
+        # contains every cell that contributed to `render_text`.
+        tight = _tight_content_bbox(block, sheet)
+        if tight is not None:
+            r0, c0, r1, c1 = tight
+            chunk_range = CellRange(
+                top_left=CellCoord(row=r0, col=c0),
+                bottom_right=CellCoord(row=r1, col=c1),
+            )
+        else:
+            # Block has zero non-empty cells (styled-empty cells across
+            # XFD columns is a real shape on this corpus). Claim only the
+            # top-left cell so the chunk is honest about being empty.
+            tl = block.cell_range.top_left
+            chunk_range = CellRange(top_left=tl, bottom_right=tl)
+
         return ChunkDTO(
             sheet_name=block.sheet_name,
             block_type=block.block_type,
-            top_left_cell=block.cell_range.top_left.to_a1(),
-            bottom_right_cell=block.cell_range.bottom_right.to_a1(),
-            cell_range=block.cell_range,
+            top_left_cell=chunk_range.top_left.to_a1(),
+            bottom_right_cell=chunk_range.bottom_right.to_a1(),
+            cell_range=chunk_range,
             key_cells=key_cells,
             named_ranges=block.named_ranges,
             dependency_summary=dep_summary,

diff --git a/tests/test_chunker_range_tighten.py b/tests/test_chunker_range_tighten.py
@@ -0,0 +1,143 @@
+"""Range-tightening invariant for chunk emission.
+
+Cluster-02 closes when every chunk's claimed (top_left, bottom_right)
+range is the bounding box of cells with non-empty content within
+``block.cell_range``. Today the chunker copies ``block.cell_range``
+verbatim — when the segmenter hands it a block whose range is wider
+than the data (empty rows/cols at the edges), the chunk overclaims
+geometric coverage.
+
+These tests assert the invariant on real fixtures + a hand-built
+"data in the upper-left corner of a wider block" workbook.
+"""
+from __future__ import annotations
+
+import openpyxl
+
+from ks_xlsx_parser.api import parse_workbook
+from ks_xlsx_parser.models.common import addr_to_a1
+
+
+def _a1_to_coord(a1: str) -> tuple[int, int]:
+    """Inverse of addr_to_a1 — small helper for tests only."""
+    letters = "".join(c for c in a1 if c.isalpha())
+    digits = "".join(c for c in a1 if c.isdigit())
+    col = 0
+    for ch in letters.upper():
+        col = col * 26 + (ord(ch) - ord("A") + 1)
+    return int(digits), col
+
+
+def test_chunk_range_tight_around_actual_content(tmp_path):
+    """A block whose range covers A1:E20 but only has data in A1:C5
+    must emit a chunk that claims A1:C5 (or tighter), not A1:E20."""
+    p = tmp_path / "sparse.xlsx"
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "S"
+    # Fill only the upper-left 5×3 region; rest of the block is empty.
+    ws["A1"] = "Name"
+    ws["B1"] = "Q1"
+    ws["C1"] = "Q2"
+    for i, name in enumerate(["Alpha", "Bravo", "Charlie", "Delta"], start=2):
+        ws.cell(row=i, column=1, value=name)
+        ws.cell(row=i, column=2, value=i * 10)
+        ws.cell(row=i, column=3, value=i * 20)
+    wb.save(p)
+
+    result = parse_workbook(path=str(p))
+    chunks_on_S = [c for c in result.chunks if c.sheet_name == "S"]
+    assert chunks_on_S, "no chunks emitted on S"
+    for c in chunks_on_S:
+        r1, c1 = _a1_to_coord(c.bottom_right_cell)
+        # Bottom-right of the claimed range must not exceed the data bbox.
+        assert r1 <= 5, (
+            f"chunk over-claims rows: {c.top_left_cell}:{c.bottom_right_cell} "
+            f"but data ends at row 5"
+        )
+        assert c1 <= 3, (
+            f"chunk over-claims cols: {c.top_left_cell}:{c.bottom_right_cell} "
+            f"but data ends at col C"
+        )
+
+
+def test_chunk_range_unchanged_when_block_is_already_tight(tmp_path):
+    """If block.cell_range already matches the data bbox, the chunk's
+    claimed range must be identical — no spurious narrowing."""
+    p = tmp_path / "dense.xlsx"
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "S"
+    ws["A1"] = "h1"
+    ws["B1"] = "h2"
+    ws["A2"] = 1
+    ws["B2"] = 2
+    wb.save(p)
+
+    result = parse_workbook(path=str(p))
+    chunks_on_S = [c for c in result.chunks if c.sheet_name == "S"]
+    assert chunks_on_S
+    # The single dense block should claim A1:B2 exactly.
+    found_tight = any(
+        c.top_left_cell == "A1" and c.bottom_right_cell == "B2"
+        for c in chunks_on_S
+    )
+    assert found_tight, [
+        (c.top_left_cell, c.bottom_right_cell) for c in chunks_on_S
+    ]
+
+
+def test_chunk_range_invariant_holds_on_corpus_fixture():
+    """End-to-end on a real SpreadsheetBench input — every emitted
+    chunk's range must be a tight bbox over cells with content.
+    Uses one of the 50-sample sheets; skipped if corpus absent."""
+    import os
+    p = (
+        "data/corpora/spreadsheetbench/all_data_912_v0.1"
+        "/spreadsheet/54105/1_54105_input.xlsx"
+    )
+    if not os.path.exists(p):
+        import pytest
+        pytest.skip(f"corpus fixture not present: {p}")
+
+    result = parse_workbook(path=p)
+    import openpyxl
+    wb = openpyxl.load_workbook(p, data_only=True)
+
+    for c in result.chunks:
+        if not c.top_left_cell or not c.bottom_right_cell:
+            continue
+        ws = wb[c.sheet_name]
+        r0, c0 = _a1_to_coord(c.top_left_cell)
+        r1, c1 = _a1_to_coord(c.bottom_right_cell)
+        # The CORNER rows/cols of the claimed range must each contain
+        # some non-empty cell — otherwise the range is over-claiming.
+        def _row_has_content(r: int) -> bool:
+            for col in range(c0, c1 + 1):
+                v = ws.cell(row=r, column=col).value
+                if v is not None and str(v).strip():
+                    return True
+            return False
+
+        def _col_has_content(col: int) -> bool:
+            for r in range(r0, r1 + 1):
+                v = ws.cell(row=r, column=col).value
+                if v is not None and str(v).strip():
+                    return True
+            return False
+
+        # Allow chart anchors (single-cell, no real content) to pass.
+        if r0 == r1 and c0 == c1:
+            continue
+        assert _row_has_content(r0), (
+            f"top row of chunk {c.top_left_cell}:{c.bottom_right_cell} is empty"
+        )
+        assert _row_has_content(r1), (
+            f"bottom row of chunk {c.top_left_cell}:{c.bottom_right_cell} is empty"
+        )
+        assert _col_has_content(c0), (
+            f"left col of chunk {c.top_left_cell}:{c.bottom_right_cell} is empty"
+        )
+        assert _col_has_content(c1), (
+            f"right col of chunk {c.top_left_cell}:{c.bottom_right_cell} is empty"
+        )