From 5ee0efa6ec61a24c6488cdb51362b327da409b18 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 4 May 2026 15:25:38 -0400 Subject: [PATCH 1/2] Add csv-to-babeltests CLI for converting CSV assets into BabelTests YAML. Translator collaborators sometimes circulate spreadsheets pairing a CURIE column with a label, equivalent CURIE, or Biolink type. This adds a click CLI that ingests such CSVs and emits a paste-ready YAML babel_tests block for a GitHub issue, optionally validating each row against a NodeNorm target from tests/targets.ini and reporting failures on stderr. The tool reuses ASSERTION_HANDLERS and CachedNodeNorm directly so the emitted YAML is guaranteed to match what GitHubIssuesTestCases parses, and per-assertion semantics never drift between the two code paths. Wires hatchling as the build backend (packaging src/ as-is so existing `from src.babel_validation.X` imports keep working) so the csv-to-babeltests console script can be exposed via [project.scripts]. Co-Authored-By: Claude Opus 4.7 --- pyproject.toml | 14 + src/babel_validation/tools/__init__.py | 0 .../tools/csv_to_babeltests.py | 428 ++++++++++++++++++ tests/tools/test_csv_to_babeltests.py | 227 ++++++++++ uv.lock | 6 +- 5 files changed, 674 insertions(+), 1 deletion(-) create mode 100644 src/babel_validation/tools/__init__.py create mode 100644 src/babel_validation/tools/csv_to_babeltests.py create mode 100644 tests/tools/test_csv_to_babeltests.py diff --git a/pyproject.toml b/pyproject.toml index 3cc1b9a..0f19386 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,8 @@ readme = "README.md" requires-python = ">=3.11" dependencies = [ "black>=25.9.0", + "click>=8.1", + "pyyaml>=6.0", "requests>=2.32.5", "tqdm>=4.67.1", "filelock", @@ -22,6 +24,18 @@ dependencies = [ [project.urls] Repository = "https://github.com/TranslatorSRI/babel-validation" +[project.scripts] +csv-to-babeltests = "src.babel_validation.tools.csv_to_babeltests:main" + +[build-system] +requires = ["hatchling"] 
+build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +# Package the entire `src/` directory so existing imports +# (`from src.babel_validation.X import Y`) continue to work after install. +packages = ["src"] + [tool.pytest.ini_options] timeout = 300 markers = [ diff --git a/src/babel_validation/tools/__init__.py b/src/babel_validation/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/babel_validation/tools/csv_to_babeltests.py b/src/babel_validation/tools/csv_to_babeltests.py new file mode 100644 index 0000000..668456d --- /dev/null +++ b/src/babel_validation/tools/csv_to_babeltests.py @@ -0,0 +1,428 @@ +"""csv-to-babeltests +==================== + +Convert a CSV of (CURIE, label/type/equivalent-CURIE) rows into a YAML +``babel_tests:`` block suitable for pasting into a GitHub issue, optionally +validating each row against a NodeNorm endpoint defined in +``tests/targets.ini``. + +The emitted YAML is the same format consumed by +``GitHubIssuesTestCases.get_test_issues_from_issue`` — assertion handlers and +YAML schema are reused, not duplicated. 
+ +Run: + uv run csv-to-babeltests INPUT.csv --curie-column OutputID \\ + --label-column "Expected Result / Suggested Comparator" --target dev +""" + +from __future__ import annotations + +import configparser +import csv +import io +import sys +from collections import defaultdict +from dataclasses import dataclass, field +from pathlib import Path + +import click +import yaml + +from src.babel_validation.assertions import ASSERTION_HANDLERS +from src.babel_validation.core.testrow import TestStatus +from src.babel_validation.services.nodenorm import CachedNodeNorm + + +# --- YAML emission --------------------------------------------------------- + +class _FlowList(list): + """List subclass that yaml.safe_dump emits in inline flow style.""" + + +def _represent_flow_list(dumper, data): + return dumper.represent_sequence( + "tag:yaml.org,2002:seq", data, flow_style=True + ) + + +yaml.SafeDumper.add_representer(_FlowList, _represent_flow_list) + + +# --- Data structures ------------------------------------------------------- + +@dataclass +class BlockEntry: + """One assertion invocation: a param_set with provenance back to a CSV row.""" + row_idx: int # 1-based; header is row 1, first data row is row 2. + param_set: list[str] + + +@dataclass +class ValidationResult: + assertion: str + row_idx: int + param_set: list[str] + status: TestStatus + messages: list[str] = field(default_factory=list) + + +# --- Pure functions (testable without network) ----------------------------- + +def read_csv(path: Path | str, delimiter: str | None = None) -> list[dict[str, str]]: + """Read a CSV file (or '-' for stdin) into a list of dict rows. + + If ``delimiter`` is None we let csv.Sniffer guess from the first 4KiB, + falling back to ',' on failure. 
+ """ + if str(path) == "-": + text = sys.stdin.read() + else: + text = Path(path).read_text(encoding="utf-8-sig") # strip any BOM + + if delimiter is None: + try: + delimiter = csv.Sniffer().sniff(text[:4096], delimiters=",\t;|").delimiter + except csv.Error: + delimiter = "," + + reader = csv.DictReader(io.StringIO(text), delimiter=delimiter) + return list(reader) + + +def build_blocks( + rows: list[dict[str, str]], + *, + curie_column: str, + label_column: str | None, + type_column: str | None, + equivalent_curie_column: str | None, + emit_resolves: bool, + dedupe: bool, + skip_empty: bool, +) -> tuple[dict[str, list[BlockEntry]], list[str]]: + """Turn CSV rows into ``{assertion_name: [BlockEntry, ...]}``. + + Returns ``(blocks, warnings)``. Warnings is a list of human-readable + strings the caller can dump to stderr (e.g. "row 17: empty OutputID"). + """ + blocks: dict[str, list[BlockEntry]] = defaultdict(list) + warnings: list[str] = [] + seen: dict[str, set[tuple[str, ...]]] = defaultdict(set) + + def add(assertion: str, param_set: list[str], row_idx: int) -> None: + key = tuple(param_set) + if dedupe and key in seen[assertion]: + return + seen[assertion].add(key) + blocks[assertion].append(BlockEntry(row_idx=row_idx, param_set=param_set)) + + for offset, row in enumerate(rows): + row_idx = offset + 2 # match what spreadsheet UIs show: header is row 1. 
+ + curie = (row.get(curie_column) or "").strip() + if not curie: + warnings.append(f"row {row_idx}: empty {curie_column!r} — skipping") + continue + + if label_column is not None: + label = (row.get(label_column) or "").strip() + if not label and skip_empty: + warnings.append(f"row {row_idx}: empty {label_column!r} — skipping HasLabel") + else: + add("HasLabel", [curie, label], row_idx) + + if type_column is not None: + biolink_type = (row.get(type_column) or "").strip() + if not biolink_type and skip_empty: + warnings.append(f"row {row_idx}: empty {type_column!r} — skipping ResolvesWithType") + else: + add("ResolvesWithType", [biolink_type, curie], row_idx) + + if equivalent_curie_column is not None: + equiv = (row.get(equivalent_curie_column) or "").strip() + if not equiv and skip_empty: + warnings.append(f"row {row_idx}: empty {equivalent_curie_column!r} — skipping ResolvesWith") + else: + add("ResolvesWith", [curie, equiv], row_idx) + + if emit_resolves: + add("Resolves", [curie], row_idx) + + return dict(blocks), warnings + + +def emit_yaml( + blocks: dict[str, list[BlockEntry]], + *, + fence: bool = True, + header: str | None = None, +) -> str: + """Render a ``{assertion_name: [BlockEntry, ...]}`` map as a YAML block. + + Each param_set is emitted as an inline flow list so the output reads like + the existing examples in ``assertions/nodenorm.py``:: + + babel_tests: + HasLabel: + - [CHEBI:15365, aspirin] + + A round-trip self-check via ``yaml.safe_load`` ensures we never emit + something the GitHub-issue parser would reject. + """ + # Match the convention used in src/babel_validation/assertions/nodenorm.py: + # single-element param_sets are emitted as bare strings ("- CHEBI:15365"), + # multi-element ones as inline flow lists ("- [CHEBI:15365, aspirin]"). + # Both parse to the same param_set via the GitHub issue loader. 
+ data = { + "babel_tests": { + assertion: [ + e.param_set[0] if len(e.param_set) == 1 else _FlowList(e.param_set) + for e in entries + ] + for assertion, entries in blocks.items() + } + } + body = yaml.safe_dump( + data, + sort_keys=False, + default_flow_style=False, + allow_unicode=True, + width=10_000, + ) + + # Round-trip self-check; raises if our output isn't loadable. + yaml.safe_load(body) + + parts: list[str] = [] + if header: + parts.append(f"# {header}") + if fence: + parts.append("```yaml") + parts.append(body.rstrip()) + if fence: + parts.append("```") + return "\n".join(parts) + "\n" + + +# --- Validation ------------------------------------------------------------ + +def validate_blocks( + blocks: dict[str, list[BlockEntry]], + nodenorm: CachedNodeNorm, +) -> list[ValidationResult]: + """Run each (assertion, param_set) through the matching ASSERTION_HANDLER. + + Param_sets are evaluated one at a time so each ``TestResult`` can be + attributed back to its source CSV row. ``CachedNodeNorm`` deduplicates + network calls per CURIE, so the per-row loop is no slower than batching. + """ + # Pre-warm the cache with every CURIE we'll need across all blocks. 
+ all_curies: set[str] = set() + for assertion, entries in blocks.items(): + handler = ASSERTION_HANDLERS[assertion.lower()] + for entry in entries: + all_curies.update(handler.curie_params(entry.param_set)) + if all_curies: + nodenorm.normalize_curies(list(all_curies)) + + results: list[ValidationResult] = [] + for assertion, entries in blocks.items(): + handler = ASSERTION_HANDLERS[assertion.lower()] + for entry in entries: + test_results = list( + handler.test_with_nodenorm( + [entry.param_set], nodenorm, + label=f"row {entry.row_idx}", + ) + ) + if test_results and all(r.status == TestStatus.Passed for r in test_results): + status = TestStatus.Passed + messages: list[str] = [] + else: + status = TestStatus.Failed + messages = [r.message for r in test_results + if r.status != TestStatus.Passed] or ["no result"] + results.append(ValidationResult( + assertion=assertion, + row_idx=entry.row_idx, + param_set=entry.param_set, + status=status, + messages=messages, + )) + return results + + +def format_report( + results: list[ValidationResult], + target_name: str, + nodenorm_url: str, +) -> str: + """Human-readable validation summary, suitable for stderr.""" + by_assertion: dict[str, list[ValidationResult]] = defaultdict(list) + for r in results: + by_assertion[r.assertion].append(r) + + lines = [f"Validation against target {target_name!r} ({nodenorm_url}):"] + for assertion, rows in by_assertion.items(): + passed = sum(1 for r in rows if r.status == TestStatus.Passed) + failed = sum(1 for r in rows if r.status == TestStatus.Failed) + lines.append(f" {assertion}: {passed} passed, {failed} failed.") + for r in rows: + if r.status == TestStatus.Failed: + params_str = ", ".join(r.param_set) + lines.append( + f" FAIL row {r.row_idx} [{params_str}] → {r.messages[0]}" + ) + return "\n".join(lines) + "\n" + + +# --- targets.ini ----------------------------------------------------------- + +def _default_targets_ini() -> Path: + """Locate tests/targets.ini relative to this 
file's repo.""" + # src/babel_validation/tools/csv_to_babeltests.py → ../../../tests/targets.ini + return Path(__file__).resolve().parents[3] / "tests" / "targets.ini" + + +def load_nodenorm_url(target_name: str, targets_ini_path: Path) -> str: + """Look up ``NodeNormURL`` for ``target_name`` in ``targets_ini_path``.""" + if not targets_ini_path.is_file(): + raise click.ClickException(f"targets.ini not found at {targets_ini_path}") + cp = configparser.ConfigParser() + cp.read(targets_ini_path, encoding="utf-8") + if target_name not in cp: + raise click.ClickException( + f"target {target_name!r} not found in {targets_ini_path}; " + f"available: {', '.join(cp.sections())}" + ) + section = cp[target_name] + if "NodeNormURL" not in section: + raise click.ClickException( + f"target {target_name!r} in {targets_ini_path} has no NodeNormURL" + ) + return section["NodeNormURL"] + + +# --- CLI ------------------------------------------------------------------- + +@click.command(context_settings={"help_option_names": ["-h", "--help"]}) +@click.argument( + "input_csv", + type=click.Path(exists=True, dir_okay=False, allow_dash=True, path_type=Path), +) +@click.option( + "--curie-column", required=True, + help="Column name containing the primary CURIE.", +) +@click.option( + "--label-column", default=None, + help="Column with the expected label → emits a HasLabel block.", +) +@click.option( + "--type-column", default=None, + help="Column with a Biolink type (e.g. 'biolink:SmallMolecule') → emits a ResolvesWithType block.", +) +@click.option( + "--equivalent-curie-column", default=None, + help="Column with a CURIE that should merge to the same canonical id → emits a ResolvesWith block.", +) +@click.option( + "--resolves/--no-resolves", "emit_resolves_flag", default=None, + help=("Force-emit (or suppress) a Resolves block. 
Default: emit a Resolves " + "block only when no other assertion column was given."), +) +@click.option("--dedupe", is_flag=True, default=False, + help="Drop duplicate param_sets within each assertion block.") +@click.option("--skip-empty/--no-skip-empty", default=True, + help="Skip rows where the assertion column is blank (with a stderr warning).") +@click.option("--delimiter", default=None, + help="CSV delimiter (default: auto-detect via csv.Sniffer; falls back to comma).") +@click.option("--target", default=None, + help="Target name from targets.ini to validate against (e.g. dev, prod, ci).") +@click.option( + "--targets-ini", + type=click.Path(exists=False, dir_okay=False, path_type=Path), + default=None, + help="Override path to targets.ini (default: tests/targets.ini in the repo).", +) +@click.option("--fence/--no-fence", default=True, + help="Wrap output in ```yaml … ``` Markdown fences (default: on).") +@click.option("--header", default=None, + help="Optional comment line emitted above the YAML block " + "(useful for recording provenance).") +def main( + input_csv: Path, + curie_column: str, + label_column: str | None, + type_column: str | None, + equivalent_curie_column: str | None, + emit_resolves_flag: bool | None, + dedupe: bool, + skip_empty: bool, + delimiter: str | None, + target: str | None, + targets_ini: Path | None, + fence: bool, + header: str | None, +) -> None: + """Convert INPUT_CSV into a BabelTests YAML block on stdout.""" + + # Default behavior for --resolves: emit Resolves only when the user + # didn't ask for any of the other assertion blocks. + if emit_resolves_flag is None: + emit_resolves = not (label_column or type_column or equivalent_curie_column) + else: + emit_resolves = emit_resolves_flag + + rows = read_csv(input_csv, delimiter=delimiter) + if rows and curie_column not in rows[0]: + raise click.ClickException( + f"--curie-column {curie_column!r} not found in CSV header. 
" + f"Available columns: {list(rows[0].keys())}" + ) + for col_name, col_label in [ + (label_column, "--label-column"), + (type_column, "--type-column"), + (equivalent_curie_column, "--equivalent-curie-column"), + ]: + if col_name and rows and col_name not in rows[0]: + raise click.ClickException( + f"{col_label} {col_name!r} not found in CSV header. " + f"Available columns: {list(rows[0].keys())}" + ) + + blocks, warnings = build_blocks( + rows, + curie_column=curie_column, + label_column=label_column, + type_column=type_column, + equivalent_curie_column=equivalent_curie_column, + emit_resolves=emit_resolves, + dedupe=dedupe, + skip_empty=skip_empty, + ) + + if not blocks: + raise click.ClickException( + "No assertions to emit — every row was skipped or no assertion " + "columns were given. Pass --label-column / --type-column / " + "--equivalent-curie-column, or use --resolves to force a Resolves " + "block on the CURIE column." + ) + + for w in warnings: + click.echo(w, err=True) + + yaml_text = emit_yaml(blocks, fence=fence, header=header) + sys.stdout.write(yaml_text) + + if target: + ini_path = targets_ini or _default_targets_ini() + nodenorm_url = load_nodenorm_url(target, ini_path) + nodenorm = CachedNodeNorm.from_url(nodenorm_url) + results = validate_blocks(blocks, nodenorm) + click.echo(format_report(results, target, nodenorm_url), err=True) + + +if __name__ == "__main__": # pragma: no cover + main() diff --git a/tests/tools/test_csv_to_babeltests.py b/tests/tools/test_csv_to_babeltests.py new file mode 100644 index 0000000..2f585ad --- /dev/null +++ b/tests/tools/test_csv_to_babeltests.py @@ -0,0 +1,227 @@ +"""Unit tests for the csv-to-babeltests CLI helpers. + +These exercise the pure-function parts of the tool — CSV parsing, block +construction, YAML emission, and round-trip parsing through the same regex ++ yaml.safe_load that the GitHub-issue test discovery uses. No network. 
+""" + +import io +import re +from pathlib import Path + +import pytest +import yaml + +from src.babel_validation.tools.csv_to_babeltests import ( + BlockEntry, + build_blocks, + emit_yaml, + read_csv, +) + + +pytestmark = pytest.mark.unit + + +# Same regex as src/babel_validation/sources/github/github_issues_test_cases.py +GITHUB_YAML_PATTERN = re.compile(r"```yaml\s+babel_tests:\s+.*?\s+```", re.DOTALL) + + +def _parse_emitted(yaml_text: str) -> dict: + """Run emit_yaml() output through the GitHub-issue parser path.""" + match = GITHUB_YAML_PATTERN.search(yaml_text) + assert match is not None, f"emitted YAML didn't match the issue regex:\n{yaml_text}" + return yaml.safe_load( + match.group(0).removeprefix("```yaml").removesuffix("```") + ) + + +# --- read_csv -------------------------------------------------------------- + +def test_read_csv_basic(tmp_path: Path): + p = tmp_path / "in.csv" + p.write_text("CURIE,Label\nCHEBI:15365,aspirin\nMONDO:1,asthma\n", encoding="utf-8") + rows = read_csv(p) + assert rows == [ + {"CURIE": "CHEBI:15365", "Label": "aspirin"}, + {"CURIE": "MONDO:1", "Label": "asthma"}, + ] + + +def test_read_csv_strips_bom(tmp_path: Path): + p = tmp_path / "in.csv" + p.write_bytes("CURIE,Label\nCHEBI:1,foo\n".encode("utf-8")) + rows = read_csv(p) + assert rows[0]["CURIE"] == "CHEBI:1" + + +def test_read_csv_handles_quoted_field_with_comma(tmp_path: Path): + p = tmp_path / "in.csv" + p.write_text( + 'CURIE,"Long, Name"\nCHEBI:1,"foo, bar"\n', encoding="utf-8" + ) + rows = read_csv(p) + assert rows[0] == {"CURIE": "CHEBI:1", "Long, Name": "foo, bar"} + + +# --- build_blocks ---------------------------------------------------------- + +ROWS = [ + {"OutputID": "CHEBI:15365", "Label": "aspirin", "Type": "biolink:SmallMolecule", "Equiv": "PUBCHEM.COMPOUND:1"}, + {"OutputID": "MONDO:0005015", "Label": "diabetes", "Type": "biolink:Disease", "Equiv": ""}, + {"OutputID": "", "Label": "missing", "Type": "", "Equiv": ""}, + {"OutputID": "CHEBI:15365", 
"Label": "aspirin", "Type": "biolink:SmallMolecule", "Equiv": "PUBCHEM.COMPOUND:1"}, # dup +] + + +def test_build_blocks_haslabel_only(): + blocks, warnings = build_blocks( + ROWS, + curie_column="OutputID", label_column="Label", + type_column=None, equivalent_curie_column=None, + emit_resolves=False, dedupe=False, skip_empty=True, + ) + assert set(blocks) == {"HasLabel"} + assert [e.param_set for e in blocks["HasLabel"]] == [ + ["CHEBI:15365", "aspirin"], + ["MONDO:0005015", "diabetes"], + ["CHEBI:15365", "aspirin"], # dup kept (dedupe=False) + ] + # row 4 (offset 2 → row index 4) was empty CURIE. + assert any("row 4" in w and "OutputID" in w for w in warnings) + + +def test_build_blocks_dedupe_drops_duplicate_param_sets(): + blocks, _ = build_blocks( + ROWS, + curie_column="OutputID", label_column="Label", + type_column=None, equivalent_curie_column=None, + emit_resolves=False, dedupe=True, skip_empty=True, + ) + assert [e.param_set for e in blocks["HasLabel"]] == [ + ["CHEBI:15365", "aspirin"], + ["MONDO:0005015", "diabetes"], + ] + + +def test_build_blocks_multiple_assertions(): + blocks, _ = build_blocks( + ROWS, + curie_column="OutputID", label_column="Label", + type_column="Type", equivalent_curie_column="Equiv", + emit_resolves=True, dedupe=True, skip_empty=True, + ) + # All four assertion blocks should be present. + assert set(blocks) == {"HasLabel", "ResolvesWithType", "ResolvesWith", "Resolves"} + # ResolvesWithType places the Biolink type first. + rwt = [e.param_set for e in blocks["ResolvesWithType"]] + assert rwt == [ + ["biolink:SmallMolecule", "CHEBI:15365"], + ["biolink:Disease", "MONDO:0005015"], + ] + # ResolvesWith only fires when Equiv is non-empty. 
+ rw = [e.param_set for e in blocks["ResolvesWith"]] + assert rw == [["CHEBI:15365", "PUBCHEM.COMPOUND:1"]] + + +def test_build_blocks_row_indices_match_spreadsheet_rows(): + """row_idx should be 1-based with the header at row 1 — matches spreadsheet UIs.""" + blocks, _ = build_blocks( + ROWS, + curie_column="OutputID", label_column="Label", + type_column=None, equivalent_curie_column=None, + emit_resolves=False, dedupe=False, skip_empty=True, + ) + indices = [e.row_idx for e in blocks["HasLabel"]] + # ROWS[0] → row 2, ROWS[1] → row 3, ROWS[3] → row 5 (ROWS[2] was empty). + assert indices == [2, 3, 5] + + +# --- emit_yaml ------------------------------------------------------------- + +def test_emit_yaml_round_trips_through_github_parser(): + blocks = { + "HasLabel": [ + BlockEntry(2, ["CHEBI:15365", "aspirin"]), + BlockEntry(3, ["MONDO:0005015", "type 2 diabetes mellitus"]), + ], + "ResolvesWithType": [ + BlockEntry(2, ["biolink:SmallMolecule", "CHEBI:15365"]), + ], + } + out = emit_yaml(blocks, fence=True, header=None) + parsed = _parse_emitted(out) + assert list(parsed["babel_tests"].keys()) == ["HasLabel", "ResolvesWithType"] + assert parsed["babel_tests"]["HasLabel"] == [ + ["CHEBI:15365", "aspirin"], + ["MONDO:0005015", "type 2 diabetes mellitus"], + ] + assert parsed["babel_tests"]["ResolvesWithType"] == [ + ["biolink:SmallMolecule", "CHEBI:15365"], + ] + + +def test_emit_yaml_with_special_characters(): + """Labels with apostrophes, commas, brackets must round-trip.""" + blocks = { + "HasLabel": [ + BlockEntry(2, ["DRUGBANK:DB00001", "Adenosine 5'-phosphosulfate"]), + BlockEntry(3, ["CHEBI:1", "foo, bar [baz]"]), + BlockEntry(4, ["CHEBI:2", ""]), # empty label edge case + ], + } + out = emit_yaml(blocks) + parsed = _parse_emitted(out) + assert parsed["babel_tests"]["HasLabel"] == [ + ["DRUGBANK:DB00001", "Adenosine 5'-phosphosulfate"], + ["CHEBI:1", "foo, bar [baz]"], + ["CHEBI:2", ""], + ] + + +def test_emit_yaml_no_fence(): + blocks = {"Resolves": 
[BlockEntry(2, ["CHEBI:15365"])]} + out = emit_yaml(blocks, fence=False) + assert "```" not in out + assert "babel_tests:" in out + # Without the fence the GitHub parser regex shouldn't find a match. + assert GITHUB_YAML_PATTERN.search(out) is None + + +def test_emit_yaml_with_header_comment(): + blocks = {"Resolves": [BlockEntry(2, ["CHEBI:15365"])]} + out = emit_yaml(blocks, header="from data/test-assets.csv") + assert out.startswith("# from data/test-assets.csv\n") + # Header doesn't break the round-trip. + parsed = _parse_emitted(out) + # Single-element param_sets are emitted as bare strings (matching the + # convention in assertions/nodenorm.py); the GitHub parser later wraps + # them back into [["CHEBI:15365"]]. + assert parsed["babel_tests"]["Resolves"] == ["CHEBI:15365"] + + +def test_emit_yaml_single_element_param_sets_emit_as_bare_strings(): + """Single-element param_sets should match the existing YAML examples in + assertions/nodenorm.py — bare strings, not single-item flow lists.""" + blocks = {"Resolves": [BlockEntry(2, ["CHEBI:15365"]), BlockEntry(3, ["MONDO:1"])]} + out = emit_yaml(blocks, fence=False) + # Bare-string form, not single-element flow lists. 
+ assert "- CHEBI:15365" in out + assert "- [CHEBI:15365]" not in out + + +# --- assertion-name compatibility with ASSERTION_HANDLERS ------------------ + +def test_emitted_assertion_names_match_handler_registry(): + """The names build_blocks emits must each resolve in ASSERTION_HANDLERS.""" + from src.babel_validation.assertions import ASSERTION_HANDLERS + blocks, _ = build_blocks( + ROWS, + curie_column="OutputID", label_column="Label", + type_column="Type", equivalent_curie_column="Equiv", + emit_resolves=True, dedupe=True, skip_empty=True, + ) + for assertion in blocks: + assert assertion.lower() in ASSERTION_HANDLERS, ( + f"build_blocks emitted unknown assertion name {assertion!r}" + ) diff --git a/uv.lock b/uv.lock index 3559d52..6d3c2f6 100644 --- a/uv.lock +++ b/uv.lock @@ -23,9 +23,10 @@ wheels = [ [[package]] name = "babel-validation" version = "0.1.0" -source = { virtual = "." } +source = { editable = "." } dependencies = [ { name = "black" }, + { name = "click" }, { name = "deepdiff" }, { name = "filelock" }, { name = "openapi-spec-validator" }, @@ -34,6 +35,7 @@ dependencies = [ { name = "pytest-timeout" }, { name = "pytest-xdist", extra = ["psutil"] }, { name = "python-dotenv" }, + { name = "pyyaml" }, { name = "requests" }, { name = "tqdm" }, ] @@ -41,6 +43,7 @@ dependencies = [ [package.metadata] requires-dist = [ { name = "black", specifier = ">=25.9.0" }, + { name = "click", specifier = ">=8.1" }, { name = "deepdiff", specifier = ">=8.6.1" }, { name = "filelock" }, { name = "openapi-spec-validator", specifier = ">=0.7.2" }, @@ -49,6 +52,7 @@ requires-dist = [ { name = "pytest-timeout", specifier = ">=2.4.0" }, { name = "pytest-xdist", extras = ["psutil"] }, { name = "python-dotenv", specifier = ">=0.9.9" }, + { name = "pyyaml", specifier = ">=6.0" }, { name = "requests", specifier = ">=2.32.5" }, { name = "tqdm", specifier = ">=4.67.1" }, ] From 97bd08f0288c4c79239ad5e949942779aa9525ac Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 4 May 
2026 19:21:50 -0400 Subject: [PATCH 2/2] Add --from-google-sheet, --from-yaml, and data-asset unit tests to csv-to-babeltests. - --from-google-sheet URL: download any Google Sheet CSV-export URL and run the same downstream pipeline as a local file - --from-yaml FILE (reverse mode): parse an existing babel_tests: YAML block and validate it against --target, printing a report to stderr - INPUT_CSV, --from-google-sheet, and --from-yaml are mutually exclusive - Add tests/data/csv_to_babeltests_fixture.csv as a committed data asset with tricky label cases (commas, apostrophes, brackets), an empty CURIE row, and a duplicate; extend it to cover new edge cases as they arise - 11 new unit tests covering the fixture CSV pipeline and parse_yaml_blocks; all 24 tests pass - Unblock tests/data/ from .gitignore so fixture files can be committed Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 2 + .../tools/csv_to_babeltests.py | 143 +++++++++++-- tests/data/csv_to_babeltests_fixture.csv | 11 + tests/tools/test_csv_to_babeltests.py | 188 ++++++++++++++++++ 4 files changed, 332 insertions(+), 12 deletions(-) create mode 100644 tests/data/csv_to_babeltests_fixture.csv diff --git a/.gitignore b/.gitignore index 4bb67d4..01ec95e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ # Ignore all data files. data/ +# But committed test fixtures in tests/data/ should be tracked. 
+!tests/data/ # From GitHub *.class diff --git a/src/babel_validation/tools/csv_to_babeltests.py b/src/babel_validation/tools/csv_to_babeltests.py index 668456d..166e979 100644 --- a/src/babel_validation/tools/csv_to_babeltests.py +++ b/src/babel_validation/tools/csv_to_babeltests.py @@ -26,6 +26,7 @@ from pathlib import Path import click +import requests import yaml from src.babel_validation.assertions import ASSERTION_HANDLERS @@ -89,6 +90,60 @@ def read_csv(path: Path | str, delimiter: str | None = None) -> list[dict[str, s return list(reader) +def read_google_sheet(url: str) -> list[dict[str, str]]: + """Download a Google Sheet CSV-export URL and parse it as CSV rows. + + Accepts any URL that returns CSV — typically the ``gviz/tq?tqx=out:csv`` + export URL shown in a Google Sheet's *File → Share → Publish to web* dialog. + """ + response = requests.get(url) + response.raise_for_status() + text = response.text + try: + delimiter = csv.Sniffer().sniff(text[:4096], delimiters=",\t;|").delimiter + except csv.Error: + delimiter = "," + reader = csv.DictReader(io.StringIO(text), delimiter=delimiter) + return list(reader) + + +def parse_yaml_blocks(source: Path | str) -> dict[str, list[BlockEntry]]: + """Parse a ``babel_tests:`` YAML block back into a ``{assertion: [BlockEntry]}`` map. + + ``source`` may be a file path or ``'-'`` for stdin. Each entry gets a + synthetic row index (1-based position within its assertion list) since + there is no originating CSV. + """ + if str(source) == "-": + text = sys.stdin.read() + else: + text = Path(source).read_text(encoding="utf-8") + + data = yaml.safe_load(text) + if not isinstance(data, dict) or "babel_tests" not in data: + raise click.ClickException( + "Input does not contain a top-level 'babel_tests:' key. 
" + "Expected YAML of the form:\n babel_tests:\n HasLabel:\n - [CURIE, label]" + ) + raw = data["babel_tests"] + if not isinstance(raw, dict): + raise click.ClickException("'babel_tests' value must be a mapping of assertion names to lists.") + + blocks: dict[str, list[BlockEntry]] = {} + for assertion, entries in raw.items(): + if not isinstance(entries, list): + raise click.ClickException(f"babel_tests.{assertion} must be a list.") + block: list[BlockEntry] = [] + for idx, entry in enumerate(entries, start=1): + if isinstance(entry, list): + param_set = [str(v) for v in entry] + else: + param_set = [str(entry)] + block.append(BlockEntry(row_idx=idx, param_set=param_set)) + blocks[assertion] = block + return blocks + + def build_blocks( rows: list[dict[str, str]], *, @@ -308,11 +363,30 @@ def load_nodenorm_url(target_name: str, targets_ini_path: Path) -> str: @click.command(context_settings={"help_option_names": ["-h", "--help"]}) @click.argument( "input_csv", - type=click.Path(exists=True, dir_okay=False, allow_dash=True, path_type=Path), + required=False, + default=None, + type=click.Path(exists=False, dir_okay=False, allow_dash=True, path_type=Path), +) +@click.option( + "--from-google-sheet", "google_sheet_url", default=None, metavar="URL", + help=( + "Download a Google Sheet CSV-export URL instead of reading a local file. " + "Use the 'File → Share → Publish to web' CSV URL, or any URL returning CSV. " + "Mutually exclusive with INPUT_CSV and --from-yaml." + ), +) +@click.option( + "--from-yaml", "from_yaml_path", default=None, metavar="FILE", + type=click.Path(exists=False, dir_okay=False, allow_dash=True, path_type=Path), + help=( + "Read an existing babel_tests: YAML block (file or '-' for stdin), validate " + "it against --target, and print a report. No YAML is written to stdout. " + "Requires --target. Mutually exclusive with INPUT_CSV and --from-google-sheet." 
+ ), ) @click.option( - "--curie-column", required=True, - help="Column name containing the primary CURIE.", + "--curie-column", default=None, + help="Column name containing the primary CURIE. Required for CSV/sheet input.", ) @click.option( "--label-column", default=None, @@ -351,8 +425,10 @@ def load_nodenorm_url(target_name: str, targets_ini_path: Path) -> str: help="Optional comment line emitted above the YAML block " "(useful for recording provenance).") def main( - input_csv: Path, - curie_column: str, + input_csv: Path | None, + google_sheet_url: str | None, + from_yaml_path: Path | None, + curie_column: str | None, label_column: str | None, type_column: str | None, equivalent_curie_column: str | None, @@ -365,16 +441,52 @@ def main( fence: bool, header: str | None, ) -> None: - """Convert INPUT_CSV into a BabelTests YAML block on stdout.""" + """Convert INPUT_CSV (or a Google Sheet) into a BabelTests YAML block on stdout. - # Default behavior for --resolves: emit Resolves only when the user - # didn't ask for any of the other assertion blocks. - if emit_resolves_flag is None: - emit_resolves = not (label_column or type_column or equivalent_curie_column) + Exactly one input source must be given: INPUT_CSV, --from-google-sheet, or + --from-yaml. With --from-yaml the tool runs in *reverse mode*: it reads an + existing YAML block, validates it against --target, and prints a report to + stderr without writing any YAML to stdout. + """ + + # --- Validate mutually-exclusive input sources --- + input_sources = [s for s in (input_csv, google_sheet_url, from_yaml_path) if s is not None] + if len(input_sources) == 0: + raise click.UsageError( + "Provide exactly one input source: INPUT_CSV, --from-google-sheet URL, " + "or --from-yaml FILE." + ) + if len(input_sources) > 1: + raise click.UsageError( + "INPUT_CSV, --from-google-sheet, and --from-yaml are mutually exclusive." 
+ ) + + # --- Reverse mode: --from-yaml --- + if from_yaml_path is not None: + if not target: + raise click.UsageError("--from-yaml requires --target to run validation.") + blocks = parse_yaml_blocks(from_yaml_path) + ini_path = targets_ini or _default_targets_ini() + nodenorm_url = load_nodenorm_url(target, ini_path) + nodenorm = CachedNodeNorm.from_url(nodenorm_url) + results = validate_blocks(blocks, nodenorm) + click.echo(format_report(results, target, nodenorm_url), err=True) + return + + # --- CSV / Google Sheet mode --- + if not curie_column: + raise click.UsageError("--curie-column is required for CSV and Google Sheet input.") + + if google_sheet_url is not None: + try: + rows = read_google_sheet(google_sheet_url) + except requests.HTTPError as exc: + raise click.ClickException(f"Failed to download Google Sheet: {exc}") from exc else: - emit_resolves = emit_resolves_flag + if not Path(str(input_csv)).exists() and str(input_csv) != "-": + raise click.ClickException(f"File not found: {input_csv}") + rows = read_csv(input_csv, delimiter=delimiter) - rows = read_csv(input_csv, delimiter=delimiter) if rows and curie_column not in rows[0]: raise click.ClickException( f"--curie-column {curie_column!r} not found in CSV header. " @@ -391,6 +503,13 @@ def main( f"Available columns: {list(rows[0].keys())}" ) + # Default behavior for --resolves: emit Resolves only when the user + # didn't ask for any of the other assertion blocks. 
+ if emit_resolves_flag is None: + emit_resolves = not (label_column or type_column or equivalent_curie_column) + else: + emit_resolves = emit_resolves_flag + blocks, warnings = build_blocks( rows, curie_column=curie_column, diff --git a/tests/data/csv_to_babeltests_fixture.csv b/tests/data/csv_to_babeltests_fixture.csv new file mode 100644 index 0000000..f48889c --- /dev/null +++ b/tests/data/csv_to_babeltests_fixture.csv @@ -0,0 +1,11 @@ +OutputID,Label,Type,Equiv,Notes +CHEBI:15365,aspirin,biolink:SmallMolecule,PUBCHEM.COMPOUND:2244,basic case +MONDO:0005015,type 2 diabetes mellitus,biolink:Disease,,no equiv CURIE +HGNC:11998,TP53,biolink:Gene,NCBIGene:7157,gene with numeric NCBIGene equiv +CHEBI:6801,"metformin",biolink:SmallMolecule,,quoted label +CHEBI:27732,"caffeine, anhydrous",biolink:SmallMolecule,,label with comma (must stay quoted) +DRUGBANK:DB00001,"Adenosine 5'-phosphosulfate",biolink:SmallMolecule,,apostrophe in label +CHEBI:1,"foo [bar] (baz)",biolink:SmallMolecule,,brackets and parens in label +,missing curie,biolink:Disease,,empty OutputID — should be skipped with a warning +CHEBI:15365,aspirin,biolink:SmallMolecule,PUBCHEM.COMPOUND:2244,exact duplicate of row 2 — deduped when --dedupe +CHEBI:17968,butyric acid,,CHEBI:17968,no type column value — ResolvesWithType should be skipped diff --git a/tests/tools/test_csv_to_babeltests.py b/tests/tools/test_csv_to_babeltests.py index 2f585ad..2d23e44 100644 --- a/tests/tools/test_csv_to_babeltests.py +++ b/tests/tools/test_csv_to_babeltests.py @@ -16,9 +16,13 @@ BlockEntry, build_blocks, emit_yaml, + parse_yaml_blocks, read_csv, ) +# Path to the committed CSV fixture used by the data-asset tests. 
+FIXTURE_CSV = Path(__file__).parent.parent / "data" / "csv_to_babeltests_fixture.csv" + pytestmark = pytest.mark.unit @@ -225,3 +229,187 @@ def test_emitted_assertion_names_match_handler_registry(): assert assertion.lower() in ASSERTION_HANDLERS, ( f"build_blocks emitted unknown assertion name {assertion!r}" ) + + +# --- CSV data-asset tests (tests/data/csv_to_babeltests_fixture.csv) -------- +# Add tricky cases to that file; these tests exercise the full pipeline from +# CSV → build_blocks → emit_yaml → yaml.safe_load without hitting the network. + +def test_fixture_csv_exists(): + assert FIXTURE_CSV.is_file(), f"Fixture CSV not found: {FIXTURE_CSV}" + + +def test_fixture_csv_labels_round_trip(): + """All non-empty Label values survive the full CSV → YAML → parse cycle.""" + rows = read_csv(FIXTURE_CSV) + blocks, warnings = build_blocks( + rows, + curie_column="OutputID", label_column="Label", + type_column=None, equivalent_curie_column=None, + emit_resolves=False, dedupe=False, skip_empty=True, + ) + assert "HasLabel" in blocks + + out = emit_yaml(blocks, fence=True) + parsed = _parse_emitted(out) + label_pairs = parsed["babel_tests"]["HasLabel"] + + # Every row with a non-empty OutputID and non-empty Label must appear. 
+ expected = [ + (r["OutputID"].strip(), r["Label"].strip()) + for r in rows + if r["OutputID"].strip() and r["Label"].strip() + ] + actual = [(p[0], p[1]) for p in label_pairs] + assert actual == expected, "Label round-trip mismatch" + + +def test_fixture_csv_tricky_labels(): + """Labels that contain commas, apostrophes, and brackets must survive intact.""" + rows = read_csv(FIXTURE_CSV) + blocks, _ = build_blocks( + rows, + curie_column="OutputID", label_column="Label", + type_column=None, equivalent_curie_column=None, + emit_resolves=False, dedupe=False, skip_empty=True, + ) + out = emit_yaml(blocks, fence=True) + parsed = _parse_emitted(out) + pairs = {p[0]: p[1] for p in parsed["babel_tests"]["HasLabel"]} + + assert pairs["CHEBI:27732"] == "caffeine, anhydrous" # comma + assert pairs["DRUGBANK:DB00001"] == "Adenosine 5'-phosphosulfate" # apostrophe + assert pairs["CHEBI:1"] == "foo [bar] (baz)" # brackets + parens + + +def test_fixture_csv_empty_curie_produces_warning(): + """A row with an empty OutputID must be skipped and produce a warning.""" + rows = read_csv(FIXTURE_CSV) + _, warnings = build_blocks( + rows, + curie_column="OutputID", label_column="Label", + type_column=None, equivalent_curie_column=None, + emit_resolves=False, dedupe=False, skip_empty=True, + ) + assert any("empty" in w and "OutputID" in w for w in warnings), ( + f"Expected a warning about empty OutputID; got: {warnings}" + ) + + +def test_fixture_csv_dedupe_removes_duplicate_row(): + """The fixture has one exact duplicate row; --dedupe should drop it.""" + rows = read_csv(FIXTURE_CSV) + blocks_keep, _ = build_blocks( + rows, + curie_column="OutputID", label_column="Label", + type_column=None, equivalent_curie_column=None, + emit_resolves=False, dedupe=False, skip_empty=True, + ) + blocks_dedup, _ = build_blocks( + rows, + curie_column="OutputID", label_column="Label", + type_column=None, equivalent_curie_column=None, + emit_resolves=False, dedupe=True, skip_empty=True, + ) + # With 
dedupe=True there should be exactly one fewer HasLabel entry.
+    assert len(blocks_dedup["HasLabel"]) == len(blocks_keep["HasLabel"]) - 1
+
+
+def test_fixture_csv_skip_empty_type_suppresses_resolveswithtype():
+    """Rows with an empty Type column must not produce ResolvesWithType entries."""
+    rows = read_csv(FIXTURE_CSV)
+    blocks, _ = build_blocks(
+        rows,
+        curie_column="OutputID", label_column=None,
+        type_column="Type", equivalent_curie_column=None,
+        emit_resolves=False, dedupe=False, skip_empty=True,
+    )
+    rwt_curies = [e.param_set[1] for e in blocks.get("ResolvesWithType", [])]
+    # CHEBI:17968 has no Type value in the fixture; it must not appear.
+    assert "CHEBI:17968" not in rwt_curies
+
+
+# --- parse_yaml_blocks -------------------------------------------------------
+
+def test_parse_yaml_blocks_roundtrip(tmp_path: Path):
+    """parse_yaml_blocks must invert emit_yaml (round-trip identity)."""
+    blocks_in = {
+        "HasLabel": [
+            BlockEntry(1, ["CHEBI:15365", "aspirin"]),
+            BlockEntry(2, ["MONDO:0005015", "type 2 diabetes mellitus"]),
+        ],
+        "ResolvesWithType": [
+            BlockEntry(1, ["biolink:SmallMolecule", "CHEBI:15365"]),
+        ],
+        "Resolves": [
+            BlockEntry(1, ["HGNC:11998"]),
+        ],
+    }
+    yaml_text = emit_yaml(blocks_in, fence=False)
+    p = tmp_path / "block.yaml"
+    p.write_text(yaml_text, encoding="utf-8")
+
+    blocks_out = parse_yaml_blocks(p)
+    assert set(blocks_out) == set(blocks_in)
+    for assertion in blocks_in:
+        in_sets = [e.param_set for e in blocks_in[assertion]]
+        out_sets = [e.param_set for e in blocks_out[assertion]]
+        assert out_sets == in_sets, f"param_sets differ for {assertion}"
+
+
+def test_parse_yaml_blocks_synthetic_row_indices(tmp_path: Path):
+    """Row indices assigned by parse_yaml_blocks should be 1-based per assertion."""
+    yaml_text = (
+        "babel_tests:\n"
+        "  HasLabel:\n"
+        "    - [CHEBI:15365, aspirin]\n"
+        "    - [MONDO:1, asthma]\n"
+    )
+    p = tmp_path / "block.yaml"
+    p.write_text(yaml_text, encoding="utf-8")
+    blocks = parse_yaml_blocks(p)
+    assert
[e.row_idx for e in blocks["HasLabel"]] == [1, 2] + + +def test_parse_yaml_blocks_single_element_entries(tmp_path: Path): + """Bare-string entries (Resolves: - CHEBI:15365) must parse to 1-element param_sets.""" + yaml_text = ( + "babel_tests:\n" + " Resolves:\n" + " - CHEBI:15365\n" + " - MONDO:1\n" + ) + p = tmp_path / "block.yaml" + p.write_text(yaml_text, encoding="utf-8") + blocks = parse_yaml_blocks(p) + assert [e.param_set for e in blocks["Resolves"]] == [["CHEBI:15365"], ["MONDO:1"]] + + +def test_parse_yaml_blocks_missing_key_raises(tmp_path: Path): + """A YAML file without a babel_tests: key must raise ClickException.""" + import click + p = tmp_path / "bad.yaml" + p.write_text("some_other_key:\n - foo\n", encoding="utf-8") + with pytest.raises(click.ClickException, match="babel_tests"): + parse_yaml_blocks(p) + + +def test_parse_yaml_blocks_tricky_labels(tmp_path: Path): + """Labels that were tricky in CSV (commas, apostrophes, brackets) must parse cleanly.""" + yaml_text = emit_yaml( + { + "HasLabel": [ + BlockEntry(1, ["CHEBI:27732", "caffeine, anhydrous"]), + BlockEntry(2, ["DRUGBANK:DB00001", "Adenosine 5'-phosphosulfate"]), + BlockEntry(3, ["CHEBI:1", "foo [bar] (baz)"]), + ] + }, + fence=False, + ) + p = tmp_path / "block.yaml" + p.write_text(yaml_text, encoding="utf-8") + blocks = parse_yaml_blocks(p) + pairs = {e.param_set[0]: e.param_set[1] for e in blocks["HasLabel"]} + assert pairs["CHEBI:27732"] == "caffeine, anhydrous" + assert pairs["DRUGBANK:DB00001"] == "Adenosine 5'-phosphosulfate" + assert pairs["CHEBI:1"] == "foo [bar] (baz)"