From 5ee0efa6ec61a24c6488cdb51362b327da409b18 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 4 May 2026 15:25:38 -0400 Subject: [PATCH 1/2] Add csv-to-babeltests CLI for converting CSV assets into BabelTests YAML. Translator collaborators sometimes circulate spreadsheets pairing a CURIE column with a label, equivalent CURIE, or Biolink type. This adds a click CLI that ingests such CSVs and emits a paste-ready YAML babel_tests block for a GitHub issue, optionally validating each row against a NodeNorm target from tests/targets.ini and reporting failures on stderr. The tool reuses ASSERTION_HANDLERS and CachedNodeNorm directly so the emitted YAML is guaranteed to match what GitHubIssuesTestCases parses, and per-assertion semantics never drift between the two code paths. Wires hatchling as the build backend (packaging src/ as-is so existing `from src.babel_validation.X` imports keep working) so the csv-to-babeltests console script can be exposed via [project.scripts]. Co-Authored-By: Claude Opus 4.7 --- pyproject.toml | 14 + src/babel_validation/tools/__init__.py | 0 .../tools/csv_to_babeltests.py | 428 ++++++++++++++++++ tests/tools/test_csv_to_babeltests.py | 227 ++++++++++ uv.lock | 6 +- 5 files changed, 674 insertions(+), 1 deletion(-) create mode 100644 src/babel_validation/tools/__init__.py create mode 100644 src/babel_validation/tools/csv_to_babeltests.py create mode 100644 tests/tools/test_csv_to_babeltests.py diff --git a/pyproject.toml b/pyproject.toml index 3cc1b9a..0f19386 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,8 @@ readme = "README.md" requires-python = ">=3.11" dependencies = [ "black>=25.9.0", + "click>=8.1", + "pyyaml>=6.0", "requests>=2.32.5", "tqdm>=4.67.1", "filelock", @@ -22,6 +24,18 @@ dependencies = [ [project.urls] Repository = "https://github.com/TranslatorSRI/babel-validation" +[project.scripts] +csv-to-babeltests = "src.babel_validation.tools.csv_to_babeltests:main" + +[build-system] +requires = ["hatchling"] 
+build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +# Package the entire `src/` directory so existing imports +# (`from src.babel_validation.X import Y`) continue to work after install. +packages = ["src"] + [tool.pytest.ini_options] timeout = 300 markers = [ diff --git a/src/babel_validation/tools/__init__.py b/src/babel_validation/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/babel_validation/tools/csv_to_babeltests.py b/src/babel_validation/tools/csv_to_babeltests.py new file mode 100644 index 0000000..668456d --- /dev/null +++ b/src/babel_validation/tools/csv_to_babeltests.py @@ -0,0 +1,428 @@ +"""csv-to-babeltests +==================== + +Convert a CSV of (CURIE, label/type/equivalent-CURIE) rows into a YAML +``babel_tests:`` block suitable for pasting into a GitHub issue, optionally +validating each row against a NodeNorm endpoint defined in +``tests/targets.ini``. + +The emitted YAML is the same format consumed by +``GitHubIssuesTestCases.get_test_issues_from_issue`` — assertion handlers and +YAML schema are reused, not duplicated. 
+ +Run: + uv run csv-to-babeltests INPUT.csv --curie-column OutputID \\ + --label-column "Expected Result / Suggested Comparator" --target dev +""" + +from __future__ import annotations + +import configparser +import csv +import io +import sys +from collections import defaultdict +from dataclasses import dataclass, field +from pathlib import Path + +import click +import yaml + +from src.babel_validation.assertions import ASSERTION_HANDLERS +from src.babel_validation.core.testrow import TestStatus +from src.babel_validation.services.nodenorm import CachedNodeNorm + + +# --- YAML emission --------------------------------------------------------- + +class _FlowList(list): + """List subclass that yaml.safe_dump emits in inline flow style.""" + + +def _represent_flow_list(dumper, data): + return dumper.represent_sequence( + "tag:yaml.org,2002:seq", data, flow_style=True + ) + + +yaml.SafeDumper.add_representer(_FlowList, _represent_flow_list) + + +# --- Data structures ------------------------------------------------------- + +@dataclass +class BlockEntry: + """One assertion invocation: a param_set with provenance back to a CSV row.""" + row_idx: int # 1-based; header is row 1, first data row is row 2. + param_set: list[str] + + +@dataclass +class ValidationResult: + assertion: str + row_idx: int + param_set: list[str] + status: TestStatus + messages: list[str] = field(default_factory=list) + + +# --- Pure functions (testable without network) ----------------------------- + +def read_csv(path: Path | str, delimiter: str | None = None) -> list[dict[str, str]]: + """Read a CSV file (or '-' for stdin) into a list of dict rows. + + If ``delimiter`` is None we let csv.Sniffer guess from the first 4KiB, + falling back to ',' on failure. 
+ """ + if str(path) == "-": + text = sys.stdin.read() + else: + text = Path(path).read_text(encoding="utf-8-sig") # strip any BOM + + if delimiter is None: + try: + delimiter = csv.Sniffer().sniff(text[:4096], delimiters=",\t;|").delimiter + except csv.Error: + delimiter = "," + + reader = csv.DictReader(io.StringIO(text), delimiter=delimiter) + return list(reader) + + +def build_blocks( + rows: list[dict[str, str]], + *, + curie_column: str, + label_column: str | None, + type_column: str | None, + equivalent_curie_column: str | None, + emit_resolves: bool, + dedupe: bool, + skip_empty: bool, +) -> tuple[dict[str, list[BlockEntry]], list[str]]: + """Turn CSV rows into ``{assertion_name: [BlockEntry, ...]}``. + + Returns ``(blocks, warnings)``. Warnings is a list of human-readable + strings the caller can dump to stderr (e.g. "row 17: empty OutputID"). + """ + blocks: dict[str, list[BlockEntry]] = defaultdict(list) + warnings: list[str] = [] + seen: dict[str, set[tuple[str, ...]]] = defaultdict(set) + + def add(assertion: str, param_set: list[str], row_idx: int) -> None: + key = tuple(param_set) + if dedupe and key in seen[assertion]: + return + seen[assertion].add(key) + blocks[assertion].append(BlockEntry(row_idx=row_idx, param_set=param_set)) + + for offset, row in enumerate(rows): + row_idx = offset + 2 # match what spreadsheet UIs show: header is row 1. 
+ + curie = (row.get(curie_column) or "").strip() + if not curie: + warnings.append(f"row {row_idx}: empty {curie_column!r} — skipping") + continue + + if label_column is not None: + label = (row.get(label_column) or "").strip() + if not label and skip_empty: + warnings.append(f"row {row_idx}: empty {label_column!r} — skipping HasLabel") + else: + add("HasLabel", [curie, label], row_idx) + + if type_column is not None: + biolink_type = (row.get(type_column) or "").strip() + if not biolink_type and skip_empty: + warnings.append(f"row {row_idx}: empty {type_column!r} — skipping ResolvesWithType") + else: + add("ResolvesWithType", [biolink_type, curie], row_idx) + + if equivalent_curie_column is not None: + equiv = (row.get(equivalent_curie_column) or "").strip() + if not equiv and skip_empty: + warnings.append(f"row {row_idx}: empty {equivalent_curie_column!r} — skipping ResolvesWith") + else: + add("ResolvesWith", [curie, equiv], row_idx) + + if emit_resolves: + add("Resolves", [curie], row_idx) + + return dict(blocks), warnings + + +def emit_yaml( + blocks: dict[str, list[BlockEntry]], + *, + fence: bool = True, + header: str | None = None, +) -> str: + """Render a ``{assertion_name: [BlockEntry, ...]}`` map as a YAML block. + + Each param_set is emitted as an inline flow list so the output reads like + the existing examples in ``assertions/nodenorm.py``:: + + babel_tests: + HasLabel: + - [CHEBI:15365, aspirin] + + A round-trip self-check via ``yaml.safe_load`` ensures we never emit + something the GitHub-issue parser would reject. + """ + # Match the convention used in src/babel_validation/assertions/nodenorm.py: + # single-element param_sets are emitted as bare strings ("- CHEBI:15365"), + # multi-element ones as inline flow lists ("- [CHEBI:15365, aspirin]"). + # Both parse to the same param_set via the GitHub issue loader. 
+ data = { + "babel_tests": { + assertion: [ + e.param_set[0] if len(e.param_set) == 1 else _FlowList(e.param_set) + for e in entries + ] + for assertion, entries in blocks.items() + } + } + body = yaml.safe_dump( + data, + sort_keys=False, + default_flow_style=False, + allow_unicode=True, + width=10_000, + ) + + # Round-trip self-check; raises if our output isn't loadable. + yaml.safe_load(body) + + parts: list[str] = [] + if header: + parts.append(f"# {header}") + if fence: + parts.append("```yaml") + parts.append(body.rstrip()) + if fence: + parts.append("```") + return "\n".join(parts) + "\n" + + +# --- Validation ------------------------------------------------------------ + +def validate_blocks( + blocks: dict[str, list[BlockEntry]], + nodenorm: CachedNodeNorm, +) -> list[ValidationResult]: + """Run each (assertion, param_set) through the matching ASSERTION_HANDLER. + + Param_sets are evaluated one at a time so each ``TestResult`` can be + attributed back to its source CSV row. ``CachedNodeNorm`` deduplicates + network calls per CURIE, so the per-row loop is no slower than batching. + """ + # Pre-warm the cache with every CURIE we'll need across all blocks. 
+ all_curies: set[str] = set() + for assertion, entries in blocks.items(): + handler = ASSERTION_HANDLERS[assertion.lower()] + for entry in entries: + all_curies.update(handler.curie_params(entry.param_set)) + if all_curies: + nodenorm.normalize_curies(list(all_curies)) + + results: list[ValidationResult] = [] + for assertion, entries in blocks.items(): + handler = ASSERTION_HANDLERS[assertion.lower()] + for entry in entries: + test_results = list( + handler.test_with_nodenorm( + [entry.param_set], nodenorm, + label=f"row {entry.row_idx}", + ) + ) + if test_results and all(r.status == TestStatus.Passed for r in test_results): + status = TestStatus.Passed + messages: list[str] = [] + else: + status = TestStatus.Failed + messages = [r.message for r in test_results + if r.status != TestStatus.Passed] or ["no result"] + results.append(ValidationResult( + assertion=assertion, + row_idx=entry.row_idx, + param_set=entry.param_set, + status=status, + messages=messages, + )) + return results + + +def format_report( + results: list[ValidationResult], + target_name: str, + nodenorm_url: str, +) -> str: + """Human-readable validation summary, suitable for stderr.""" + by_assertion: dict[str, list[ValidationResult]] = defaultdict(list) + for r in results: + by_assertion[r.assertion].append(r) + + lines = [f"Validation against target {target_name!r} ({nodenorm_url}):"] + for assertion, rows in by_assertion.items(): + passed = sum(1 for r in rows if r.status == TestStatus.Passed) + failed = sum(1 for r in rows if r.status == TestStatus.Failed) + lines.append(f" {assertion}: {passed} passed, {failed} failed.") + for r in rows: + if r.status == TestStatus.Failed: + params_str = ", ".join(r.param_set) + lines.append( + f" FAIL row {r.row_idx} [{params_str}] → {r.messages[0]}" + ) + return "\n".join(lines) + "\n" + + +# --- targets.ini ----------------------------------------------------------- + +def _default_targets_ini() -> Path: + """Locate tests/targets.ini relative to this 
file's repo.""" + # src/babel_validation/tools/csv_to_babeltests.py → ../../../tests/targets.ini + return Path(__file__).resolve().parents[3] / "tests" / "targets.ini" + + +def load_nodenorm_url(target_name: str, targets_ini_path: Path) -> str: + """Look up ``NodeNormURL`` for ``target_name`` in ``targets_ini_path``.""" + if not targets_ini_path.is_file(): + raise click.ClickException(f"targets.ini not found at {targets_ini_path}") + cp = configparser.ConfigParser() + cp.read(targets_ini_path, encoding="utf-8") + if target_name not in cp: + raise click.ClickException( + f"target {target_name!r} not found in {targets_ini_path}; " + f"available: {', '.join(cp.sections())}" + ) + section = cp[target_name] + if "NodeNormURL" not in section: + raise click.ClickException( + f"target {target_name!r} in {targets_ini_path} has no NodeNormURL" + ) + return section["NodeNormURL"] + + +# --- CLI ------------------------------------------------------------------- + +@click.command(context_settings={"help_option_names": ["-h", "--help"]}) +@click.argument( + "input_csv", + type=click.Path(exists=True, dir_okay=False, allow_dash=True, path_type=Path), +) +@click.option( + "--curie-column", required=True, + help="Column name containing the primary CURIE.", +) +@click.option( + "--label-column", default=None, + help="Column with the expected label → emits a HasLabel block.", +) +@click.option( + "--type-column", default=None, + help="Column with a Biolink type (e.g. 'biolink:SmallMolecule') → emits a ResolvesWithType block.", +) +@click.option( + "--equivalent-curie-column", default=None, + help="Column with a CURIE that should merge to the same canonical id → emits a ResolvesWith block.", +) +@click.option( + "--resolves/--no-resolves", "emit_resolves_flag", default=None, + help=("Force-emit (or suppress) a Resolves block. 
Default: emit a Resolves " + "block only when no other assertion column was given."), +) +@click.option("--dedupe", is_flag=True, default=False, + help="Drop duplicate param_sets within each assertion block.") +@click.option("--skip-empty/--no-skip-empty", default=True, + help="Skip rows where the assertion column is blank (with a stderr warning).") +@click.option("--delimiter", default=None, + help="CSV delimiter (default: auto-detect via csv.Sniffer; falls back to comma).") +@click.option("--target", default=None, + help="Target name from targets.ini to validate against (e.g. dev, prod, ci).") +@click.option( + "--targets-ini", + type=click.Path(exists=False, dir_okay=False, path_type=Path), + default=None, + help="Override path to targets.ini (default: tests/targets.ini in the repo).", +) +@click.option("--fence/--no-fence", default=True, + help="Wrap output in ```yaml … ``` Markdown fences (default: on).") +@click.option("--header", default=None, + help="Optional comment line emitted above the YAML block " + "(useful for recording provenance).") +def main( + input_csv: Path, + curie_column: str, + label_column: str | None, + type_column: str | None, + equivalent_curie_column: str | None, + emit_resolves_flag: bool | None, + dedupe: bool, + skip_empty: bool, + delimiter: str | None, + target: str | None, + targets_ini: Path | None, + fence: bool, + header: str | None, +) -> None: + """Convert INPUT_CSV into a BabelTests YAML block on stdout.""" + + # Default behavior for --resolves: emit Resolves only when the user + # didn't ask for any of the other assertion blocks. + if emit_resolves_flag is None: + emit_resolves = not (label_column or type_column or equivalent_curie_column) + else: + emit_resolves = emit_resolves_flag + + rows = read_csv(input_csv, delimiter=delimiter) + if rows and curie_column not in rows[0]: + raise click.ClickException( + f"--curie-column {curie_column!r} not found in CSV header. 
" + f"Available columns: {list(rows[0].keys())}" + ) + for col_name, col_label in [ + (label_column, "--label-column"), + (type_column, "--type-column"), + (equivalent_curie_column, "--equivalent-curie-column"), + ]: + if col_name and rows and col_name not in rows[0]: + raise click.ClickException( + f"{col_label} {col_name!r} not found in CSV header. " + f"Available columns: {list(rows[0].keys())}" + ) + + blocks, warnings = build_blocks( + rows, + curie_column=curie_column, + label_column=label_column, + type_column=type_column, + equivalent_curie_column=equivalent_curie_column, + emit_resolves=emit_resolves, + dedupe=dedupe, + skip_empty=skip_empty, + ) + + if not blocks: + raise click.ClickException( + "No assertions to emit — every row was skipped or no assertion " + "columns were given. Pass --label-column / --type-column / " + "--equivalent-curie-column, or use --resolves to force a Resolves " + "block on the CURIE column." + ) + + for w in warnings: + click.echo(w, err=True) + + yaml_text = emit_yaml(blocks, fence=fence, header=header) + sys.stdout.write(yaml_text) + + if target: + ini_path = targets_ini or _default_targets_ini() + nodenorm_url = load_nodenorm_url(target, ini_path) + nodenorm = CachedNodeNorm.from_url(nodenorm_url) + results = validate_blocks(blocks, nodenorm) + click.echo(format_report(results, target, nodenorm_url), err=True) + + +if __name__ == "__main__": # pragma: no cover + main() diff --git a/tests/tools/test_csv_to_babeltests.py b/tests/tools/test_csv_to_babeltests.py new file mode 100644 index 0000000..2f585ad --- /dev/null +++ b/tests/tools/test_csv_to_babeltests.py @@ -0,0 +1,227 @@ +"""Unit tests for the csv-to-babeltests CLI helpers. + +These exercise the pure-function parts of the tool — CSV parsing, block +construction, YAML emission, and round-trip parsing through the same regex ++ yaml.safe_load that the GitHub-issue test discovery uses. No network. 
+""" + +import io +import re +from pathlib import Path + +import pytest +import yaml + +from src.babel_validation.tools.csv_to_babeltests import ( + BlockEntry, + build_blocks, + emit_yaml, + read_csv, +) + + +pytestmark = pytest.mark.unit + + +# Same regex as src/babel_validation/sources/github/github_issues_test_cases.py +GITHUB_YAML_PATTERN = re.compile(r"```yaml\s+babel_tests:\s+.*?\s+```", re.DOTALL) + + +def _parse_emitted(yaml_text: str) -> dict: + """Run emit_yaml() output through the GitHub-issue parser path.""" + match = GITHUB_YAML_PATTERN.search(yaml_text) + assert match is not None, f"emitted YAML didn't match the issue regex:\n{yaml_text}" + return yaml.safe_load( + match.group(0).removeprefix("```yaml").removesuffix("```") + ) + + +# --- read_csv -------------------------------------------------------------- + +def test_read_csv_basic(tmp_path: Path): + p = tmp_path / "in.csv" + p.write_text("CURIE,Label\nCHEBI:15365,aspirin\nMONDO:1,asthma\n", encoding="utf-8") + rows = read_csv(p) + assert rows == [ + {"CURIE": "CHEBI:15365", "Label": "aspirin"}, + {"CURIE": "MONDO:1", "Label": "asthma"}, + ] + + +def test_read_csv_strips_bom(tmp_path: Path): + p = tmp_path / "in.csv" + p.write_bytes("CURIE,Label\nCHEBI:1,foo\n".encode("utf-8")) + rows = read_csv(p) + assert rows[0]["CURIE"] == "CHEBI:1" + + +def test_read_csv_handles_quoted_field_with_comma(tmp_path: Path): + p = tmp_path / "in.csv" + p.write_text( + 'CURIE,"Long, Name"\nCHEBI:1,"foo, bar"\n', encoding="utf-8" + ) + rows = read_csv(p) + assert rows[0] == {"CURIE": "CHEBI:1", "Long, Name": "foo, bar"} + + +# --- build_blocks ---------------------------------------------------------- + +ROWS = [ + {"OutputID": "CHEBI:15365", "Label": "aspirin", "Type": "biolink:SmallMolecule", "Equiv": "PUBCHEM.COMPOUND:1"}, + {"OutputID": "MONDO:0005015", "Label": "diabetes", "Type": "biolink:Disease", "Equiv": ""}, + {"OutputID": "", "Label": "missing", "Type": "", "Equiv": ""}, + {"OutputID": "CHEBI:15365", 
"Label": "aspirin", "Type": "biolink:SmallMolecule", "Equiv": "PUBCHEM.COMPOUND:1"}, # dup +] + + +def test_build_blocks_haslabel_only(): + blocks, warnings = build_blocks( + ROWS, + curie_column="OutputID", label_column="Label", + type_column=None, equivalent_curie_column=None, + emit_resolves=False, dedupe=False, skip_empty=True, + ) + assert set(blocks) == {"HasLabel"} + assert [e.param_set for e in blocks["HasLabel"]] == [ + ["CHEBI:15365", "aspirin"], + ["MONDO:0005015", "diabetes"], + ["CHEBI:15365", "aspirin"], # dup kept (dedupe=False) + ] + # row 4 (offset 2 → row index 4) was empty CURIE. + assert any("row 4" in w and "OutputID" in w for w in warnings) + + +def test_build_blocks_dedupe_drops_duplicate_param_sets(): + blocks, _ = build_blocks( + ROWS, + curie_column="OutputID", label_column="Label", + type_column=None, equivalent_curie_column=None, + emit_resolves=False, dedupe=True, skip_empty=True, + ) + assert [e.param_set for e in blocks["HasLabel"]] == [ + ["CHEBI:15365", "aspirin"], + ["MONDO:0005015", "diabetes"], + ] + + +def test_build_blocks_multiple_assertions(): + blocks, _ = build_blocks( + ROWS, + curie_column="OutputID", label_column="Label", + type_column="Type", equivalent_curie_column="Equiv", + emit_resolves=True, dedupe=True, skip_empty=True, + ) + # All four assertion blocks should be present. + assert set(blocks) == {"HasLabel", "ResolvesWithType", "ResolvesWith", "Resolves"} + # ResolvesWithType places the Biolink type first. + rwt = [e.param_set for e in blocks["ResolvesWithType"]] + assert rwt == [ + ["biolink:SmallMolecule", "CHEBI:15365"], + ["biolink:Disease", "MONDO:0005015"], + ] + # ResolvesWith only fires when Equiv is non-empty. 
+ rw = [e.param_set for e in blocks["ResolvesWith"]] + assert rw == [["CHEBI:15365", "PUBCHEM.COMPOUND:1"]] + + +def test_build_blocks_row_indices_match_spreadsheet_rows(): + """row_idx should be 1-based with the header at row 1 — matches spreadsheet UIs.""" + blocks, _ = build_blocks( + ROWS, + curie_column="OutputID", label_column="Label", + type_column=None, equivalent_curie_column=None, + emit_resolves=False, dedupe=False, skip_empty=True, + ) + indices = [e.row_idx for e in blocks["HasLabel"]] + # ROWS[0] → row 2, ROWS[1] → row 3, ROWS[3] → row 5 (ROWS[2] was empty). + assert indices == [2, 3, 5] + + +# --- emit_yaml ------------------------------------------------------------- + +def test_emit_yaml_round_trips_through_github_parser(): + blocks = { + "HasLabel": [ + BlockEntry(2, ["CHEBI:15365", "aspirin"]), + BlockEntry(3, ["MONDO:0005015", "type 2 diabetes mellitus"]), + ], + "ResolvesWithType": [ + BlockEntry(2, ["biolink:SmallMolecule", "CHEBI:15365"]), + ], + } + out = emit_yaml(blocks, fence=True, header=None) + parsed = _parse_emitted(out) + assert list(parsed["babel_tests"].keys()) == ["HasLabel", "ResolvesWithType"] + assert parsed["babel_tests"]["HasLabel"] == [ + ["CHEBI:15365", "aspirin"], + ["MONDO:0005015", "type 2 diabetes mellitus"], + ] + assert parsed["babel_tests"]["ResolvesWithType"] == [ + ["biolink:SmallMolecule", "CHEBI:15365"], + ] + + +def test_emit_yaml_with_special_characters(): + """Labels with apostrophes, commas, brackets must round-trip.""" + blocks = { + "HasLabel": [ + BlockEntry(2, ["DRUGBANK:DB00001", "Adenosine 5'-phosphosulfate"]), + BlockEntry(3, ["CHEBI:1", "foo, bar [baz]"]), + BlockEntry(4, ["CHEBI:2", ""]), # empty label edge case + ], + } + out = emit_yaml(blocks) + parsed = _parse_emitted(out) + assert parsed["babel_tests"]["HasLabel"] == [ + ["DRUGBANK:DB00001", "Adenosine 5'-phosphosulfate"], + ["CHEBI:1", "foo, bar [baz]"], + ["CHEBI:2", ""], + ] + + +def test_emit_yaml_no_fence(): + blocks = {"Resolves": 
[BlockEntry(2, ["CHEBI:15365"])]} + out = emit_yaml(blocks, fence=False) + assert "```" not in out + assert "babel_tests:" in out + # Without the fence the GitHub parser regex shouldn't find a match. + assert GITHUB_YAML_PATTERN.search(out) is None + + +def test_emit_yaml_with_header_comment(): + blocks = {"Resolves": [BlockEntry(2, ["CHEBI:15365"])]} + out = emit_yaml(blocks, header="from data/test-assets.csv") + assert out.startswith("# from data/test-assets.csv\n") + # Header doesn't break the round-trip. + parsed = _parse_emitted(out) + # Single-element param_sets are emitted as bare strings (matching the + # convention in assertions/nodenorm.py); the GitHub parser later wraps + # them back into [["CHEBI:15365"]]. + assert parsed["babel_tests"]["Resolves"] == ["CHEBI:15365"] + + +def test_emit_yaml_single_element_param_sets_emit_as_bare_strings(): + """Single-element param_sets should match the existing YAML examples in + assertions/nodenorm.py — bare strings, not single-item flow lists.""" + blocks = {"Resolves": [BlockEntry(2, ["CHEBI:15365"]), BlockEntry(3, ["MONDO:1"])]} + out = emit_yaml(blocks, fence=False) + # Bare-string form, not single-element flow lists. 
+ assert "- CHEBI:15365" in out + assert "- [CHEBI:15365]" not in out + + +# --- assertion-name compatibility with ASSERTION_HANDLERS ------------------ + +def test_emitted_assertion_names_match_handler_registry(): + """The names build_blocks emits must each resolve in ASSERTION_HANDLERS.""" + from src.babel_validation.assertions import ASSERTION_HANDLERS + blocks, _ = build_blocks( + ROWS, + curie_column="OutputID", label_column="Label", + type_column="Type", equivalent_curie_column="Equiv", + emit_resolves=True, dedupe=True, skip_empty=True, + ) + for assertion in blocks: + assert assertion.lower() in ASSERTION_HANDLERS, ( + f"build_blocks emitted unknown assertion name {assertion!r}" + ) diff --git a/uv.lock b/uv.lock index 3559d52..6d3c2f6 100644 --- a/uv.lock +++ b/uv.lock @@ -23,9 +23,10 @@ wheels = [ [[package]] name = "babel-validation" version = "0.1.0" -source = { virtual = "." } +source = { editable = "." } dependencies = [ { name = "black" }, + { name = "click" }, { name = "deepdiff" }, { name = "filelock" }, { name = "openapi-spec-validator" }, @@ -34,6 +35,7 @@ dependencies = [ { name = "pytest-timeout" }, { name = "pytest-xdist", extra = ["psutil"] }, { name = "python-dotenv" }, + { name = "pyyaml" }, { name = "requests" }, { name = "tqdm" }, ] @@ -41,6 +43,7 @@ dependencies = [ [package.metadata] requires-dist = [ { name = "black", specifier = ">=25.9.0" }, + { name = "click", specifier = ">=8.1" }, { name = "deepdiff", specifier = ">=8.6.1" }, { name = "filelock" }, { name = "openapi-spec-validator", specifier = ">=0.7.2" }, @@ -49,6 +52,7 @@ requires-dist = [ { name = "pytest-timeout", specifier = ">=2.4.0" }, { name = "pytest-xdist", extras = ["psutil"] }, { name = "python-dotenv", specifier = ">=0.9.9" }, + { name = "pyyaml", specifier = ">=6.0" }, { name = "requests", specifier = ">=2.32.5" }, { name = "tqdm", specifier = ">=4.67.1" }, ] From 97bd08f0288c4c79239ad5e949942779aa9525ac Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 4 May 
2026 19:21:50 -0400 Subject: [PATCH 2/2] Add --from-google-sheet, --from-yaml, and data-asset unit tests to csv-to-babeltests. - --from-google-sheet URL: download any Google Sheet CSV-export URL and run the same downstream pipeline as a local file - --from-yaml FILE (reverse mode): parse an existing babel_tests: YAML block and validate it against --target, printing a report to stderr - INPUT_CSV, --from-google-sheet, and --from-yaml are mutually exclusive - Add tests/data/csv_to_babeltests_fixture.csv as a committed data asset with tricky label cases (commas, apostrophes, brackets), an empty CURIE row, and a duplicate; extend it to cover new edge cases as they arise - 11 new unit tests covering the fixture CSV pipeline and parse_yaml_blocks; all 24 tests pass - Unblock tests/data/ from .gitignore so fixture files can be committed Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 2 + .../tools/csv_to_babeltests.py | 143 +++++++++++-- tests/data/csv_to_babeltests_fixture.csv | 11 + tests/tools/test_csv_to_babeltests.py | 188 ++++++++++++++++++ 4 files changed, 332 insertions(+), 12 deletions(-) create mode 100644 tests/data/csv_to_babeltests_fixture.csv diff --git a/.gitignore b/.gitignore index 4bb67d4..01ec95e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ # Ignore all data files. data/ +# But committed test fixtures in tests/data/ should be tracked. 
+!tests/data/ # From GitHub *.class diff --git a/src/babel_validation/tools/csv_to_babeltests.py b/src/babel_validation/tools/csv_to_babeltests.py index 668456d..166e979 100644 --- a/src/babel_validation/tools/csv_to_babeltests.py +++ b/src/babel_validation/tools/csv_to_babeltests.py @@ -26,6 +26,7 @@ from pathlib import Path import click +import requests import yaml from src.babel_validation.assertions import ASSERTION_HANDLERS @@ -89,6 +90,60 @@ def read_csv(path: Path | str, delimiter: str | None = None) -> list[dict[str, s return list(reader) +def read_google_sheet(url: str) -> list[dict[str, str]]: + """Download a Google Sheet CSV-export URL and parse it as CSV rows. + + Accepts any URL that returns CSV — typically the ``gviz/tq?tqx=out:csv`` + export URL shown in a Google Sheet's *File → Share → Publish to web* dialog. + """ + response = requests.get(url) + response.raise_for_status() + text = response.text + try: + delimiter = csv.Sniffer().sniff(text[:4096], delimiters=",\t;|").delimiter + except csv.Error: + delimiter = "," + reader = csv.DictReader(io.StringIO(text), delimiter=delimiter) + return list(reader) + + +def parse_yaml_blocks(source: Path | str) -> dict[str, list[BlockEntry]]: + """Parse a ``babel_tests:`` YAML block back into a ``{assertion: [BlockEntry]}`` map. + + ``source`` may be a file path or ``'-'`` for stdin. Each entry gets a + synthetic row index (1-based position within its assertion list) since + there is no originating CSV. + """ + if str(source) == "-": + text = sys.stdin.read() + else: + text = Path(source).read_text(encoding="utf-8") + + data = yaml.safe_load(text) + if not isinstance(data, dict) or "babel_tests" not in data: + raise click.ClickException( + "Input does not contain a top-level 'babel_tests:' key. 
" + "Expected YAML of the form:\n babel_tests:\n HasLabel:\n - [CURIE, label]" + ) + raw = data["babel_tests"] + if not isinstance(raw, dict): + raise click.ClickException("'babel_tests' value must be a mapping of assertion names to lists.") + + blocks: dict[str, list[BlockEntry]] = {} + for assertion, entries in raw.items(): + if not isinstance(entries, list): + raise click.ClickException(f"babel_tests.{assertion} must be a list.") + block: list[BlockEntry] = [] + for idx, entry in enumerate(entries, start=1): + if isinstance(entry, list): + param_set = [str(v) for v in entry] + else: + param_set = [str(entry)] + block.append(BlockEntry(row_idx=idx, param_set=param_set)) + blocks[assertion] = block + return blocks + + def build_blocks( rows: list[dict[str, str]], *, @@ -308,11 +363,30 @@ def load_nodenorm_url(target_name: str, targets_ini_path: Path) -> str: @click.command(context_settings={"help_option_names": ["-h", "--help"]}) @click.argument( "input_csv", - type=click.Path(exists=True, dir_okay=False, allow_dash=True, path_type=Path), + required=False, + default=None, + type=click.Path(exists=False, dir_okay=False, allow_dash=True, path_type=Path), +) +@click.option( + "--from-google-sheet", "google_sheet_url", default=None, metavar="URL", + help=( + "Download a Google Sheet CSV-export URL instead of reading a local file. " + "Use the 'File → Share → Publish to web' CSV URL, or any URL returning CSV. " + "Mutually exclusive with INPUT_CSV and --from-yaml." + ), +) +@click.option( + "--from-yaml", "from_yaml_path", default=None, metavar="FILE", + type=click.Path(exists=False, dir_okay=False, allow_dash=True, path_type=Path), + help=( + "Read an existing babel_tests: YAML block (file or '-' for stdin), validate " + "it against --target, and print a report. No YAML is written to stdout. " + "Requires --target. Mutually exclusive with INPUT_CSV and --from-google-sheet." 
+ ), ) @click.option( - "--curie-column", required=True, - help="Column name containing the primary CURIE.", + "--curie-column", default=None, + help="Column name containing the primary CURIE. Required for CSV/sheet input.", ) @click.option( "--label-column", default=None, @@ -351,8 +425,10 @@ def load_nodenorm_url(target_name: str, targets_ini_path: Path) -> str: help="Optional comment line emitted above the YAML block " "(useful for recording provenance).") def main( - input_csv: Path, - curie_column: str, + input_csv: Path | None, + google_sheet_url: str | None, + from_yaml_path: Path | None, + curie_column: str | None, label_column: str | None, type_column: str | None, equivalent_curie_column: str | None, @@ -365,16 +441,52 @@ def main( fence: bool, header: str | None, ) -> None: - """Convert INPUT_CSV into a BabelTests YAML block on stdout.""" + """Convert INPUT_CSV (or a Google Sheet) into a BabelTests YAML block on stdout. - # Default behavior for --resolves: emit Resolves only when the user - # didn't ask for any of the other assertion blocks. - if emit_resolves_flag is None: - emit_resolves = not (label_column or type_column or equivalent_curie_column) + Exactly one input source must be given: INPUT_CSV, --from-google-sheet, or + --from-yaml. With --from-yaml the tool runs in *reverse mode*: it reads an + existing YAML block, validates it against --target, and prints a report to + stderr without writing any YAML to stdout. + """ + + # --- Validate mutually-exclusive input sources --- + input_sources = [s for s in (input_csv, google_sheet_url, from_yaml_path) if s is not None] + if len(input_sources) == 0: + raise click.UsageError( + "Provide exactly one input source: INPUT_CSV, --from-google-sheet URL, " + "or --from-yaml FILE." + ) + if len(input_sources) > 1: + raise click.UsageError( + "INPUT_CSV, --from-google-sheet, and --from-yaml are mutually exclusive." 
+ ) + + # --- Reverse mode: --from-yaml --- + if from_yaml_path is not None: + if not target: + raise click.UsageError("--from-yaml requires --target to run validation.") + blocks = parse_yaml_blocks(from_yaml_path) + ini_path = targets_ini or _default_targets_ini() + nodenorm_url = load_nodenorm_url(target, ini_path) + nodenorm = CachedNodeNorm.from_url(nodenorm_url) + results = validate_blocks(blocks, nodenorm) + click.echo(format_report(results, target, nodenorm_url), err=True) + return + + # --- CSV / Google Sheet mode --- + if not curie_column: + raise click.UsageError("--curie-column is required for CSV and Google Sheet input.") + + if google_sheet_url is not None: + try: + rows = read_google_sheet(google_sheet_url) + except requests.HTTPError as exc: + raise click.ClickException(f"Failed to download Google Sheet: {exc}") from exc else: - emit_resolves = emit_resolves_flag + if not Path(str(input_csv)).exists() and str(input_csv) != "-": + raise click.ClickException(f"File not found: {input_csv}") + rows = read_csv(input_csv, delimiter=delimiter) - rows = read_csv(input_csv, delimiter=delimiter) if rows and curie_column not in rows[0]: raise click.ClickException( f"--curie-column {curie_column!r} not found in CSV header. " @@ -391,6 +503,13 @@ def main( f"Available columns: {list(rows[0].keys())}" ) + # Default behavior for --resolves: emit Resolves only when the user + # didn't ask for any of the other assertion blocks. 
+ if emit_resolves_flag is None: + emit_resolves = not (label_column or type_column or equivalent_curie_column) + else: + emit_resolves = emit_resolves_flag + blocks, warnings = build_blocks( rows, curie_column=curie_column, diff --git a/tests/data/csv_to_babeltests_fixture.csv b/tests/data/csv_to_babeltests_fixture.csv new file mode 100644 index 0000000..f48889c --- /dev/null +++ b/tests/data/csv_to_babeltests_fixture.csv @@ -0,0 +1,11 @@ +OutputID,Label,Type,Equiv,Notes +CHEBI:15365,aspirin,biolink:SmallMolecule,PUBCHEM.COMPOUND:2244,basic case +MONDO:0005015,type 2 diabetes mellitus,biolink:Disease,,no equiv CURIE +HGNC:11998,TP53,biolink:Gene,NCBIGene:7157,gene with numeric NCBIGene equiv +CHEBI:6801,"metformin",biolink:SmallMolecule,,quoted label +CHEBI:27732,"caffeine, anhydrous",biolink:SmallMolecule,,label with comma (must stay quoted) +DRUGBANK:DB00001,"Adenosine 5'-phosphosulfate",biolink:SmallMolecule,,apostrophe in label +CHEBI:1,"foo [bar] (baz)",biolink:SmallMolecule,,brackets and parens in label +,missing curie,biolink:Disease,,empty OutputID — should be skipped with a warning +CHEBI:15365,aspirin,biolink:SmallMolecule,PUBCHEM.COMPOUND:2244,exact duplicate of row 2 — deduped when --dedupe +CHEBI:17968,butyric acid,,CHEBI:17968,no type column value — ResolvesWithType should be skipped diff --git a/tests/tools/test_csv_to_babeltests.py b/tests/tools/test_csv_to_babeltests.py index 2f585ad..2d23e44 100644 --- a/tests/tools/test_csv_to_babeltests.py +++ b/tests/tools/test_csv_to_babeltests.py @@ -16,9 +16,13 @@ BlockEntry, build_blocks, emit_yaml, + parse_yaml_blocks, read_csv, ) +# Path to the committed CSV fixture used by the data-asset tests. 
+FIXTURE_CSV = Path(__file__).parent.parent / "data" / "csv_to_babeltests_fixture.csv" + pytestmark = pytest.mark.unit @@ -225,3 +229,187 @@ def test_emitted_assertion_names_match_handler_registry(): assert assertion.lower() in ASSERTION_HANDLERS, ( f"build_blocks emitted unknown assertion name {assertion!r}" ) + + +# --- CSV data-asset tests (tests/data/csv_to_babeltests_fixture.csv) -------- +# Add tricky cases to that file; these tests exercise the full pipeline from +# CSV → build_blocks → emit_yaml → yaml.safe_load without hitting the network. + +def test_fixture_csv_exists(): + assert FIXTURE_CSV.is_file(), f"Fixture CSV not found: {FIXTURE_CSV}" + + +def test_fixture_csv_labels_round_trip(): + """All non-empty Label values survive the full CSV → YAML → parse cycle.""" + rows = read_csv(FIXTURE_CSV) + blocks, warnings = build_blocks( + rows, + curie_column="OutputID", label_column="Label", + type_column=None, equivalent_curie_column=None, + emit_resolves=False, dedupe=False, skip_empty=True, + ) + assert "HasLabel" in blocks + + out = emit_yaml(blocks, fence=True) + parsed = _parse_emitted(out) + label_pairs = parsed["babel_tests"]["HasLabel"] + + # Every row with a non-empty OutputID and non-empty Label must appear. 
+ expected = [ + (r["OutputID"].strip(), r["Label"].strip()) + for r in rows + if r["OutputID"].strip() and r["Label"].strip() + ] + actual = [(p[0], p[1]) for p in label_pairs] + assert actual == expected, "Label round-trip mismatch" + + +def test_fixture_csv_tricky_labels(): + """Labels that contain commas, apostrophes, and brackets must survive intact.""" + rows = read_csv(FIXTURE_CSV) + blocks, _ = build_blocks( + rows, + curie_column="OutputID", label_column="Label", + type_column=None, equivalent_curie_column=None, + emit_resolves=False, dedupe=False, skip_empty=True, + ) + out = emit_yaml(blocks, fence=True) + parsed = _parse_emitted(out) + pairs = {p[0]: p[1] for p in parsed["babel_tests"]["HasLabel"]} + + assert pairs["CHEBI:27732"] == "caffeine, anhydrous" # comma + assert pairs["DRUGBANK:DB00001"] == "Adenosine 5'-phosphosulfate" # apostrophe + assert pairs["CHEBI:1"] == "foo [bar] (baz)" # brackets + parens + + +def test_fixture_csv_empty_curie_produces_warning(): + """A row with an empty OutputID must be skipped and produce a warning.""" + rows = read_csv(FIXTURE_CSV) + _, warnings = build_blocks( + rows, + curie_column="OutputID", label_column="Label", + type_column=None, equivalent_curie_column=None, + emit_resolves=False, dedupe=False, skip_empty=True, + ) + assert any("empty" in w and "OutputID" in w for w in warnings), ( + f"Expected a warning about empty OutputID; got: {warnings}" + ) + + +def test_fixture_csv_dedupe_removes_duplicate_row(): + """The fixture has one exact duplicate row; --dedupe should drop it.""" + rows = read_csv(FIXTURE_CSV) + blocks_keep, _ = build_blocks( + rows, + curie_column="OutputID", label_column="Label", + type_column=None, equivalent_curie_column=None, + emit_resolves=False, dedupe=False, skip_empty=True, + ) + blocks_dedup, _ = build_blocks( + rows, + curie_column="OutputID", label_column="Label", + type_column=None, equivalent_curie_column=None, + emit_resolves=False, dedupe=True, skip_empty=True, + ) + # With 
dedupe=True there should be exactly one fewer HasLabel entry.
+    assert len(blocks_dedup["HasLabel"]) == len(blocks_keep["HasLabel"]) - 1
+
+
+def test_fixture_csv_skip_empty_type_suppresses_resolveswithtype():
+    """Rows with an empty Type column must not produce ResolvesWithType entries."""
+    rows = read_csv(FIXTURE_CSV)
+    blocks, _ = build_blocks(
+        rows,
+        curie_column="OutputID", label_column=None,
+        type_column="Type", equivalent_curie_column=None,
+        emit_resolves=False, dedupe=False, skip_empty=True,
+    )
+    rwt_curies = [e.param_set[1] for e in blocks.get("ResolvesWithType", [])]
+    # CHEBI:17968 has no Type value in the fixture; it must not appear.
+    assert "CHEBI:17968" not in rwt_curies
+
+
+# --- parse_yaml_blocks -------------------------------------------------------
+
+def test_parse_yaml_blocks_roundtrip(tmp_path: Path):
+    """parse_yaml_blocks must invert emit_yaml (round-trip identity)."""
+    blocks_in = {
+        "HasLabel": [
+            BlockEntry(1, ["CHEBI:15365", "aspirin"]),
+            BlockEntry(2, ["MONDO:0005015", "type 2 diabetes mellitus"]),
+        ],
+        "ResolvesWithType": [
+            BlockEntry(1, ["biolink:SmallMolecule", "CHEBI:15365"]),
+        ],
+        "Resolves": [
+            BlockEntry(1, ["HGNC:11998"]),
+        ],
+    }
+    yaml_text = emit_yaml(blocks_in, fence=False)
+    p = tmp_path / "block.yaml"
+    p.write_text(yaml_text, encoding="utf-8")
+
+    blocks_out = parse_yaml_blocks(p)
+    assert set(blocks_out) == set(blocks_in)
+    for assertion in blocks_in:
+        in_sets = [e.param_set for e in blocks_in[assertion]]
+        out_sets = [e.param_set for e in blocks_out[assertion]]
+        assert out_sets == in_sets, f"param_sets differ for {assertion}"
+
+
+def test_parse_yaml_blocks_synthetic_row_indices(tmp_path: Path):
+    """Row indices assigned by parse_yaml_blocks should be 1-based per assertion."""
+    yaml_text = (
+        "babel_tests:\n"
+        "  HasLabel:\n"
+        "    - [CHEBI:15365, aspirin]\n"
+        "    - [MONDO:1, asthma]\n"
+    )
+    p = tmp_path / "block.yaml"
+    p.write_text(yaml_text, encoding="utf-8")
+    blocks = parse_yaml_blocks(p)
+    assert
[e.row_idx for e in blocks["HasLabel"]] == [1, 2] + + +def test_parse_yaml_blocks_single_element_entries(tmp_path: Path): + """Bare-string entries (Resolves: - CHEBI:15365) must parse to 1-element param_sets.""" + yaml_text = ( + "babel_tests:\n" + " Resolves:\n" + " - CHEBI:15365\n" + " - MONDO:1\n" + ) + p = tmp_path / "block.yaml" + p.write_text(yaml_text, encoding="utf-8") + blocks = parse_yaml_blocks(p) + assert [e.param_set for e in blocks["Resolves"]] == [["CHEBI:15365"], ["MONDO:1"]] + + +def test_parse_yaml_blocks_missing_key_raises(tmp_path: Path): + """A YAML file without a babel_tests: key must raise ClickException.""" + import click + p = tmp_path / "bad.yaml" + p.write_text("some_other_key:\n - foo\n", encoding="utf-8") + with pytest.raises(click.ClickException, match="babel_tests"): + parse_yaml_blocks(p) + + +def test_parse_yaml_blocks_tricky_labels(tmp_path: Path): + """Labels that were tricky in CSV (commas, apostrophes, brackets) must parse cleanly.""" + yaml_text = emit_yaml( + { + "HasLabel": [ + BlockEntry(1, ["CHEBI:27732", "caffeine, anhydrous"]), + BlockEntry(2, ["DRUGBANK:DB00001", "Adenosine 5'-phosphosulfate"]), + BlockEntry(3, ["CHEBI:1", "foo [bar] (baz)"]), + ] + }, + fence=False, + ) + p = tmp_path / "block.yaml" + p.write_text(yaml_text, encoding="utf-8") + blocks = parse_yaml_blocks(p) + pairs = {e.param_set[0]: e.param_set[1] for e in blocks["HasLabel"]} + assert pairs["CHEBI:27732"] == "caffeine, anhydrous" + assert pairs["DRUGBANK:DB00001"] == "Adenosine 5'-phosphosulfate" + assert pairs["CHEBI:1"] == "foo [bar] (baz)"