Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions scripts/append_bench_history.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,15 @@ def main(argv: list[str] | None = None) -> int:
"parser": args.parser,
"run": summary_path.parent.name,
"instances": metrics.get("instances"),
"in_scope_instances": metrics.get("in_scope_instances"),
"out_of_scope_instances": metrics.get("out_of_scope_instances"),
"recall_text@1": metrics.get("recall_text@1"),
"recall_text@3": metrics.get("recall_text@3"),
"recall_text@5": metrics.get("recall_text@5"),
"recall_geometric@5": metrics.get("recall_geometric@5"),
# In-scope numbers are the gate per the recall-90 roadmap.
"recall_text@5_in_scope": metrics.get("recall_text@5_in_scope"),
"recall_geometric@5_in_scope": metrics.get("recall_geometric@5_in_scope"),
"table_fragmentation_rate": metrics.get("table_fragmentation_rate"),
"mean_parse_ms": metrics.get("mean_parse_ms"),
"errors": metrics.get("errors"),
Expand All @@ -82,13 +87,14 @@ def main(argv: list[str] | None = None) -> int:

print(f"appended to {HISTORY.relative_to(ROOT)}:")
print(f" commit {row['commit']} recall_text@5={row['recall_text@5']} "
f"recall_text@1={row['recall_text@1']}")
f"in_scope={row['recall_text@5_in_scope']}")

# Show the trend if there's history to compare against.
rows = [json.loads(ln) for ln in HISTORY.read_text().splitlines() if ln.strip()]
if len(rows) >= 2:
prev, cur = rows[-2], rows[-1]
for k in ("recall_text@5", "recall_text@1"):
for k in ("recall_text@5", "recall_text@5_in_scope",
"recall_geometric@5", "recall_geometric@5_in_scope"):
p, c = prev.get(k), cur.get(k)
if isinstance(p, int | float) and isinstance(c, int | float):
delta = c - p
Expand Down
86 changes: 52 additions & 34 deletions scripts/enrich_failures.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,46 +178,64 @@ def enrich(run_dir: Path, corpus: Path, out_path: Path) -> None:
if answer_regions else answer_sheet)
answer_range_bbox = answer_regions[0][1] if answer_regions else None
n_input_cells_in_answer_range = 0
if (wb_d and answer_sheet_resolved and answer_range_bbox
and answer_sheet_resolved in wb_d.sheetnames):
try:
# When the dataset has no `answer_sheet` (561/912 instances) the
# answer_position is a bare range — fall back to the first
# worksheet, same as the harness scoring path. Without this fallback
# the flag is a false positive for ~470 of 912 instances.
if wb_d and answer_range_bbox:
ws = None
if answer_sheet_resolved and answer_sheet_resolved in wb_d.sheetnames:
ws = wb_d[answer_sheet_resolved]
r0, c0, r1, c1 = answer_range_bbox
for row in ws.iter_rows(min_row=r0, max_row=r1, min_col=c0,
max_col=c1, values_only=True):
for v in row:
if v is not None and str(v).strip():
n_input_cells_in_answer_range += 1
except Exception:
pass
elif wb_d.worksheets:
ws = wb_d.worksheets[0]
if ws is not None:
try:
r0, c0, r1, c1 = answer_range_bbox
for row in ws.iter_rows(min_row=r0, max_row=r1, min_col=c0,
max_col=c1, values_only=True):
for v in row:
if v is not None and str(v).strip():
n_input_cells_in_answer_range += 1
except Exception:
pass

gt_cell_raw = None
gt_cell_formula = None
gt_cell_data_only = None
n_workbook_cells_in_gt = 0
if wb_f and gt_sheet and gt_sheet in wb_f.sheetnames and gt_range_bbox:
ws_f = wb_f[gt_sheet]
ws_d = wb_d[gt_sheet]
r0, c0, r1, c1 = gt_range_bbox
# First cell only — enough to know "formula vs. value".
try:
tl_cell_f = ws_f.cell(row=r0, column=c0)
tl_cell_d = ws_d.cell(row=r0, column=c0)
gt_cell_raw = tl_cell_f.value
if isinstance(gt_cell_raw, str) and gt_cell_raw.startswith("="):
gt_cell_formula = gt_cell_raw
gt_cell_data_only = tl_cell_d.value
except Exception:
pass
# Count non-empty cells across the range
try:
for row in ws_d.iter_rows(min_row=r0, max_row=r1, min_col=c0,
max_col=c1, values_only=True):
for v in row:
if v is not None and str(v).strip():
n_workbook_cells_in_gt += 1
except Exception:
pass
# Same first-worksheet fallback as the answer-range block above.
# Without it, every instance with no explicit `answer_sheet` (561 / 912
# in the v0.1 corpus) trips the `gt_range_empty_in_workbook` flag
# spuriously.
if wb_f and gt_range_bbox:
ws_f = ws_d = None
if gt_sheet and gt_sheet in wb_f.sheetnames:
ws_f = wb_f[gt_sheet]
ws_d = wb_d[gt_sheet]
elif wb_f.worksheets:
ws_f = wb_f.worksheets[0]
ws_d = wb_d.worksheets[0]
if ws_f is not None and ws_d is not None:
r0, c0, r1, c1 = gt_range_bbox
# First cell only — enough to know "formula vs. value".
try:
tl_cell_f = ws_f.cell(row=r0, column=c0)
tl_cell_d = ws_d.cell(row=r0, column=c0)
gt_cell_raw = tl_cell_f.value
if isinstance(gt_cell_raw, str) and gt_cell_raw.startswith("="):
gt_cell_formula = gt_cell_raw
gt_cell_data_only = tl_cell_d.value
except Exception:
pass
# Count non-empty cells across the range
try:
for row in ws_d.iter_rows(min_row=r0, max_row=r1, min_col=c0,
max_col=c1, values_only=True):
for v in row:
if v is not None and str(v).strip():
n_workbook_cells_in_gt += 1
except Exception:
pass

chunks_on_gt = [c for c in chunks if gt_sheet and c.sheet_name == gt_sheet]
gt_chunk_bbox = chunk_bbox(chunks_on_gt)
Expand Down
Loading
Loading