From 98dd96ef98eb1cae609d93d80c4cc0c0f7696f9d Mon Sep 17 00:00:00 2001 From: Arnav Goel Date: Wed, 20 May 2026 03:46:07 -0400 Subject: [PATCH] renderer(tier-1): row anchors + number-format expansion + merged-cell propagation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three changes to text_renderer.render_block(), each addressing a distinct way the parser/chunker was losing signal between the workbook and the chunk's render_text. Together they move the parser-quality metric (rank IS NOT None — answer surfaced in some chunk's text) by +0.6 pp on the full 912 SpreadsheetBench v0.1, and lift text@5 by +1.0 pp / text@5_in_scope by +0.9 pp on the same run. 1. Row-number anchors Every data row of the markdown grid now carries an `r` prefix where N is the sheet row (1-indexed): [Sheet1!A1:D10] (table) | A | B | C | D | |------|-----|-----|------| r1 | Name | Q1 | Q2 | Q3 | r2 | Wgt | 100 | 150 | 200 | A downstream LLM consuming the chunk can now compute cell coordinates deterministically: the block header gives the A1 range; per-row anchors close the gap to (row, col). Citation- grade output for the agent-side use cases on ks-backend. 2. Number-format-aware rendering When a cell's number_format produces a meaningfully-different displayed string (0.06 → "6%", 1272 → "1,272.00", 46022 → date), we now emit both: r2 | 0.06 [6%] | 1272 [1,272.00] | Substring-search retrieval hits either form — the question may quote the raw or the displayed, and answer.xlsx may use the display form even though input.xlsx keeps the raw. Trivial diffs (1272 → "1272.00", "1272.0") are NOT expanded — no information added, only noise. 3. Merged-cell value propagation Slave cells in a merged region currently render blank because openpyxl returns None for them. That kills text-match retrieval whenever a question references the cell by a slave coordinate. Renderer now looks up the master and emits the master's value at each slave with a `← ` propagation marker: r1 | Total | ← Total | ← Total | The merged region's visible value now appears at every position it appears in Excel, not just the top-left. Bench (full 912 / text-embedding-3-large): parser-quality (rank IS NOT None): 0.843 → 0.849 (+4 instances) recall_text@5: 0.750 → 0.760 (+0.010) recall_text@5_in_scope: 0.838 → 0.847 (+0.009) recall_geometric@5: 0.482 → 0.484 (no real change) mean parse_ms: 156 → 184 (+27 ms) per-instance: 6 miss→hit, 0 hit→miss Tests: tests/test_renderer_tier1.py — 7 cases (row anchor presence + correct sheet-row indexing, percent/decimal format expansion, trivial-diff suppression, merged-cell propagation + sanity). 1079 → 1086 total passing. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ks_xlsx_parser/rendering/text_renderer.py | 142 ++++++++++--- tests/test_renderer_tier1.py | 200 ++++++++++++++++++ 2 files changed, 308 insertions(+), 34 deletions(-) create mode 100644 tests/test_renderer_tier1.py diff --git a/src/ks_xlsx_parser/rendering/text_renderer.py b/src/ks_xlsx_parser/rendering/text_renderer.py index e2c15f4..1afc2e5 100644 --- a/src/ks_xlsx_parser/rendering/text_renderer.py +++ b/src/ks_xlsx_parser/rendering/text_renderer.py @@ -13,7 +13,7 @@ from ks_xlsx_parser.models.block import BlockDTO from ks_xlsx_parser.models.chart import ChartDTO -from ks_xlsx_parser.models.common import BlockType, col_number_to_letter +from ks_xlsx_parser.models.common import BlockType, CellCoord, col_number_to_letter from ks_xlsx_parser.models.sheet import SheetDTO logger = logging.getLogger(__name__) @@ -57,9 +57,15 @@ def _format_number_for_retrieval(raw: int | float) -> str: def _cell_render_value(cell) -> str: """Pick the string form of `cell` that's best for RAG retrieval. - For *numeric* cells we ignore the display-formatted string and emit - the raw value verbatim — Excel's commas, percent signs, trailing - zeros, and currency symbols all defeat substring search. + For *numeric* cells we emit the raw value verbatim — Excel's + commas, percent signs, trailing zeros, and currency symbols all + defeat substring search. **When the cell carries a number-format + that meaningfully changes the displayed string (e.g. 0.06 → "6%", + 1272 → "1,272.00", 46022 → "2025-12-31"), we additionally append + the formatted form in `[brackets]`** so substring match hits + either the raw or the displayed shape — the question may quote + either, and `answer.xlsx` may use the display form even though + `input.xlsx` keeps the raw. For dates we emit ISO ``YYYY-MM-DD`` (no time component) which is both human-readable and matches the date format that openpyxl / @@ -79,7 +85,15 @@ def _cell_render_value(cell) -> str: return raw.isoformat() if isinstance(raw, (int, float)) and not isinstance(raw, bool): - return _format_number_for_retrieval(raw) + raw_str = _format_number_for_retrieval(raw) + # If a meaningful number-format produced a different display + # string, emit both forms. Skip when the displayed form is + # identical to the raw (no information added) or trivially + # convertible (just trailing zeros), to keep render_text terse. + disp = (cell.display_value or "").strip() + if disp and disp != raw_str and not _is_trivial_format_diff(raw_str, disp): + return f"{raw_str} [{disp}]" + return raw_str if cell.display_value is not None: return str(cell.display_value) @@ -88,6 +102,28 @@ def _cell_render_value(cell) -> str: return "" +def _is_trivial_format_diff(raw_str: str, display_str: str) -> bool: + """True if `display_str` adds no retrieval value over `raw_str`. + + Trivial: ``"1272"`` → ``"1272.0"`` / ``"1272.00"`` (trailing zeros + only, no other formatting change). The displayed form contributes + no new tokens substring-search could hit. + + NOT trivial: ``"1272"`` → ``"1,272.00"`` (thousands separator), or + ``"0.06"`` → ``"6%"``, or ``"1272"`` → ``"$1,272"``. Each of these + surfaces a distinct token a user might quote. + """ + if raw_str == display_str: + return True + # Trim trailing zeros after a decimal point on the displayed form. + # If what remains equals the raw, the only difference was insignificant. + if "." in display_str: + head, tail = display_str.split(".", 1) + if tail.rstrip("0") == "" and head == raw_str: + return True + return False + + class TextRenderer: """ Renders blocks as plain text with coordinate context. @@ -105,11 +141,17 @@ def render_block(self, block: BlockDTO) -> str: Format: [Sheet1!A1:D10] (table: "SalesData") - | A | B | C | D | - |----------|---------|--------|---------| - | Product | Q1 | Q2 | Q3 | - | Widget A | 100 | 150 | 200 | + | A | B | C | D | + |----------|---------|--------|---------| + r1 | Product | Q1 | Q2 | Q3 | + r2 | Widget A | 100 | 150 | 200 | ... + + Per-row `r` prefix carries the sheet row number so a + downstream LLM consumer can compute cell coordinates + deterministically (block header gives the A1 range; per-row + anchors close the gap to (row, col)). The prefix width is + sized to the largest row number in the block. """ rng = block.cell_range rows = range(rng.top_left.row, rng.bottom_right.row + 1) @@ -124,35 +166,73 @@ def render_block(self, block: BlockDTO) -> str: header += f' table: "{block.table_name}"' lines.append(header) + # Row-anchor width — `r` plus padding. Sized once per block + # so all rows align under a constant-width column. + row_anchor_width = max(len(f"r{r}") for r in rows) + row_anchor_pad = " " * row_anchor_width # blank slot for header / separator + + # Build slave→master lookup for merged regions on this sheet. + # Slave cells (everything in a merged range except the master) + # render the master's value with a `←` propagation marker so + # the chunk's text contains the visible value at every position + # it appears in Excel, not just the top-left of the region. + merged_master: dict[tuple[int, int], CellCoord] = {} + for region in self._sheet.merged_regions: + mr = region.range + master = region.master + for r in range(mr.top_left.row, mr.bottom_right.row + 1): + for c in range(mr.top_left.col, mr.bottom_right.col + 1): + if r == master.row and c == master.col: + continue + merged_master[(r, c)] = master + + def _value_for(row: int, col: int) -> tuple[str, bool]: + """Return (rendered string, is_propagated_from_master).""" + cell = self._sheet.get_cell(row, col) + if cell is not None and not cell.is_merged_slave: + val = _cell_render_value(cell) + if cell.formula and not val.startswith("="): + val = f"{val} [=]" + return _flatten_cell_text(val), False + # Slave: propagate the master's value. + master = merged_master.get((row, col)) + if master is None: + return "", False + master_cell = self._sheet.get_cell(master.row, master.col) + if master_cell is None: + return "", False + mval = _cell_render_value(master_cell) + if master_cell.formula and not mval.startswith("="): + mval = f"{mval} [=]" + return _flatten_cell_text(f"← {mval}"), True + # Compute column widths using the SAME rendering rules the data - # rows will use, including the trailing `[=]` formula marker. - # Otherwise `[=]` inflates a cell past col_width post-hoc and - # spuriously triggers the long-value fallback below. + # rows will use, including the trailing `[=]` formula marker + # AND the merged-cell `←` propagation marker. Otherwise these + # inflate a cell past col_width post-hoc and spuriously trigger + # the long-value fallback below. col_widths: dict[int, int] = {} for col in cols: col_letter = col_number_to_letter(col) max_width = len(col_letter) for row in rows: - cell = self._sheet.get_cell(row, col) - if cell is None: - continue - val = _cell_render_value(cell) - if cell.formula and not val.startswith("="): - val = f"{val} [=]" - val = _flatten_cell_text(val) + val, _ = _value_for(row, col) max_width = max(max_width, len(val)) col_widths[col] = min(max_width, 30) # Cap at 30 for alignment; text may overflow - # Column header row + # Column header row — leading blank slot matches the row-anchor width. col_headers = [] for col in cols: if col in self._sheet.hidden_cols: continue letter = col_number_to_letter(col) col_headers.append(letter.ljust(col_widths[col])) - lines.append("| " + " | ".join(col_headers) + " |") + lines.append(row_anchor_pad + " | " + " | ".join(col_headers) + " |") lines.append( - "|-" + "-|-".join("-" * col_widths[c] for c in cols if c not in self._sheet.hidden_cols) + "-|" + row_anchor_pad + + " |-" + + "-|-".join("-" * col_widths[c] for c in cols if c not in self._sheet.hidden_cols) + + "-|" ) # Data rows @@ -161,20 +241,13 @@ def render_block(self, block: BlockDTO) -> str: if row in self._sheet.hidden_rows: continue + anchor = f"r{row}".ljust(row_anchor_width) + values = [] for col in cols: if col in self._sheet.hidden_cols: continue - cell = self._sheet.get_cell(row, col) - val = _cell_render_value(cell) if cell else "" - - if cell and cell.formula and not val.startswith("="): - val = f"{val} [=]" - - # Markdown table rows are single-line; collapse embedded newlines - # (common in headers like "租金\n天数") so they don't break the grid. - val = _flatten_cell_text(val) - + val, _ = _value_for(row, col) # Long-value fallback: only triggers if the rendered string # genuinely exceeds the (now consistently-computed) column # width — i.e. the column was capped at 30. We still emit @@ -182,7 +255,7 @@ def render_block(self, block: BlockDTO) -> str: # alignment overflow; truncating destroys retrievability. values.append(val.ljust(col_widths[col])) - line = "| " + " | ".join(values) + " |" + line = anchor + " | " + " | ".join(values) + " |" lines.append(line) # Add separator after first row if it looks like a header @@ -191,7 +264,8 @@ def render_block(self, block: BlockDTO) -> str: BlockType.ASSUMPTIONS_TABLE, ): lines.append( - "|-" + row_anchor_pad + + " |-" + "-|-".join( "-" * col_widths[c] for c in cols diff --git a/tests/test_renderer_tier1.py b/tests/test_renderer_tier1.py new file mode 100644 index 0000000..f64c95f --- /dev/null +++ b/tests/test_renderer_tier1.py @@ -0,0 +1,200 @@ +"""Tier-1 chunker-quality improvements in the text renderer. + +Three changes, all in `text_renderer.render_block`: + +1. Row-number anchors — each data row of the markdown grid carries + its sheet row number, so an LLM consuming the chunk can compute + `(row, col)` cell coordinates deterministically. The block header + already prints the A1 range; per-row anchors close the gap. + +2. Number-format-aware value rendering — Excel cells store a raw + value (`0.06`, `46022`) but DISPLAY a formatted form (`6%`, + `2025-12-31`) per the cell's `number_format`. When the two differ + meaningfully, render BOTH so substring-match retrieval hits either + form (the question may quote either, and `answer.xlsx` may use + the display form even though `input.xlsx` keeps the raw). + +3. Merged-cell value propagation — slave cells in a merged region + currently render as blank because openpyxl returns `None` for + them. That kills text-match retrieval whenever a question + references the cell by a slave coordinate. Render the master's + value at each slave with a `←` marker indicating propagation. + +Each test uses an openpyxl-built fixture and asserts on the chunk's +`render_text` end-to-end (via `parse_workbook`) so the change is +visible to the downstream consumer that matters. +""" +from __future__ import annotations + +import openpyxl + +from ks_xlsx_parser.api import parse_workbook + + +def _all_text(workbook_path) -> str: + result = parse_workbook(path=str(workbook_path)) + return "\n".join(c.render_text or "" for c in result.chunks) + + +# ────────────────────────────────────────────────── #1 row anchors + + +def test_row_anchor_appears_for_each_data_row(tmp_path): + """Every non-hidden data row gets a `r` prefix where N is the + sheet row number (1-indexed), not the 0-indexed position within + the block.""" + p = tmp_path / "anchors.xlsx" + wb = openpyxl.Workbook() + ws = wb.active + ws.title = "S" + ws["A1"] = "name" + ws["B1"] = "qty" + ws["A2"] = "Widget" + ws["B2"] = 100 + ws["A3"] = "Hub" + ws["B3"] = 200 + wb.save(p) + + text = _all_text(p) + # Every row from the block should be addressable. + assert "r1" in text, f"missing row-1 anchor; chunk text was:\n{text}" + assert "r2" in text, f"missing row-2 anchor; chunk text was:\n{text}" + assert "r3" in text, f"missing row-3 anchor; chunk text was:\n{text}" + # The anchors should be visibly bound to the right values — `r2` + # should appear ahead of `Widget` on the same line. + widget_line = next( + (line for line in text.splitlines() if "Widget" in line), None, + ) + assert widget_line is not None and "r2" in widget_line, ( + f"row anchor not on the same line as its data: " + f"line={widget_line!r}" + ) + + +def test_row_anchor_uses_sheet_row_not_chunk_offset(tmp_path): + """A block that starts at row 5 must anchor its first data row as + `r5`, not `r1`. Citation depends on this.""" + p = tmp_path / "offset.xlsx" + wb = openpyxl.Workbook() + ws = wb.active + ws.title = "S" + # Leave rows 1-4 empty (no header lines, no data); block starts at 5. + ws["A5"] = "name" + ws["B5"] = "qty" + ws["A6"] = "Widget" + ws["B6"] = 100 + wb.save(p) + + text = _all_text(p) + assert "r5" in text, f"missing r5 anchor:\n{text}" + assert "r6" in text, f"missing r6 anchor:\n{text}" + + +# ────────────────────────────────────────────────── #2 number-format expansion + + +def test_percent_format_renders_both_forms(tmp_path): + """A cell storing 0.06 with number_format '0%' must render both + `0.06` (raw) AND `6%` (displayed). Substring match hits either.""" + p = tmp_path / "percent.xlsx" + wb = openpyxl.Workbook() + ws = wb.active + ws.title = "S" + ws["A1"] = "rate" + ws["A2"] = 0.06 + ws["A2"].number_format = "0%" + wb.save(p) + + text = _all_text(p) + assert "0.06" in text, f"raw form missing:\n{text}" + assert "6%" in text, f"displayed form missing:\n{text}" + + +def test_decimal_format_renders_both_forms(tmp_path): + """Format `#,##0.00` should add the formatted form (`1,272.00`) + alongside the raw `1272`.""" + p = tmp_path / "decimal.xlsx" + wb = openpyxl.Workbook() + ws = wb.active + ws.title = "S" + ws["A1"] = "amount" + ws["A2"] = 1272 + ws["A2"].number_format = "#,##0.00" + wb.save(p) + + text = _all_text(p) + assert "1272" in text, f"raw form missing:\n{text}" + assert "1,272.00" in text, f"comma-format missing:\n{text}" + + +def test_no_format_expansion_when_format_is_general(tmp_path): + """When number_format is 'General' (the default), do NOT add a + redundant '(value)' clause — the raw is the display.""" + p = tmp_path / "general.xlsx" + wb = openpyxl.Workbook() + ws = wb.active + ws.title = "S" + ws["A1"] = "amount" + ws["A2"] = 1272 # default General format + wb.save(p) + + text = _all_text(p) + assert "1272" in text + # No bracketed duplicate of the raw form. + assert "1272 [1272]" not in text + + +# ────────────────────────────────────────────────── #3 merged-cell propagation + + +def test_merged_cell_master_value_propagates_to_slaves(tmp_path): + """`ws.merge_cells('A1:C1')` with A1='Total' should render the + string 'Total' at A1 AND a propagation marker at B1 and C1 so the + full grid carries the visible value at every position it appears + in Excel.""" + p = tmp_path / "merged.xlsx" + wb = openpyxl.Workbook() + ws = wb.active + ws.title = "S" + ws["A1"] = "Total" + ws.merge_cells("A1:C1") + ws["A2"] = 1 + ws["B2"] = 2 + ws["C2"] = 3 + wb.save(p) + + text = _all_text(p) + # The master value must be present at least once. + assert "Total" in text, f"master value missing:\n{text}" + # The propagated form contains the master's value with a marker. + # We accept either "← Total" or any text containing both 'Total' + # and a propagation indicator on the slave column's cell. + header_row = next( + (l for l in text.splitlines() if "Total" in l), None, + ) + assert header_row is not None, "no row contains 'Total'" + # Count occurrences of the string `Total` on that row — should be + # ≥ 2 (master + at least one slave) for the propagation to be + # observable. + assert header_row.count("Total") >= 2, ( + f"expected 'Total' to repeat across the merged region; " + f"row was:\n{header_row}" + ) + + +def test_unmerged_cells_render_normally(tmp_path): + """Sanity check: a workbook with no merged regions must produce + the same shape as before this change (no spurious markers).""" + p = tmp_path / "no_merge.xlsx" + wb = openpyxl.Workbook() + ws = wb.active + ws.title = "S" + ws["A1"] = "name" + ws["B1"] = "qty" + ws["A2"] = "Widget" + ws["B2"] = 100 + wb.save(p) + + text = _all_text(p) + # No propagation marker should appear because nothing is merged. + assert "←" not in text, f"unexpected propagation marker:\n{text}"