From 3f575c4f20cf69b663a90b8410076c9b24efcb6c Mon Sep 17 00:00:00 2001 From: Genmin Date: Fri, 1 May 2026 20:57:08 -0700 Subject: [PATCH] Add runtime provenance to benchmark metadata --- policybench/cli.py | 2 ++ policybench/eval_no_tools.py | 4 +++ policybench/provenance.py | 55 ++++++++++++++++++++++++++++++++++++ tests/test_eval_no_tools.py | 3 ++ tests/test_provenance.py | 34 ++++++++++++++++++++++ 5 files changed, 98 insertions(+) create mode 100644 policybench/provenance.py create mode 100644 tests/test_provenance.py diff --git a/policybench/cli.py b/policybench/cli.py index d00f538..990b214 100644 --- a/policybench/cli.py +++ b/policybench/cli.py @@ -448,6 +448,7 @@ def main(): if args.command in {"reference-outputs", "ground-truth"}: from policybench.ground_truth import calculate_ground_truth from policybench.policyengine_runtime import runtime_metadata_for_country + from policybench.provenance import runtime_provenance from policybench.scenarios import ( generate_scenarios, get_uk_dataset_path, @@ -488,6 +489,7 @@ def main(): "programs": sorted(programs), "output": args.output, "scenario_manifest_output": args.scenario_manifest_output, + "runtime_environment": runtime_provenance(), **runtime_metadata_for_country( args.country, source_dataset_path=source_dataset_path, diff --git a/policybench/eval_no_tools.py b/policybench/eval_no_tools.py index 2a18a07..6a69fbf 100644 --- a/policybench/eval_no_tools.py +++ b/policybench/eval_no_tools.py @@ -7,6 +7,7 @@ import signal import threading import time +from datetime import datetime, timezone from pathlib import Path from typing import Iterable @@ -22,6 +23,7 @@ make_no_tools_batch_prompt, make_no_tools_batch_repair_prompt, ) +from policybench.provenance import runtime_provenance from policybench.scenarios import Scenario, scenario_to_dict from policybench.spec import expand_programs_for_scenario @@ -1419,6 +1421,7 @@ def _build_resume_metadata( return { "metadata_version": RESUME_METADATA_VERSION, "task": task, + 
"generated_at_utc": datetime.now(timezone.utc).isoformat(), "run_id": run_id, "include_explanations": include_explanations, "scenario_count": len(scenarios), @@ -1426,6 +1429,7 @@ def _build_resume_metadata( "programs": sorted(programs), "models": {name: models[name] for name in sorted(models)}, "policyengine_bundles": policyengine_bundles_for_countries(countries), + "runtime_environment": runtime_provenance(), } diff --git a/policybench/provenance.py b/policybench/provenance.py new file mode 100644 index 0000000..cde51dd --- /dev/null +++ b/policybench/provenance.py @@ -0,0 +1,55 @@ +"""Runtime provenance helpers for benchmark artifacts.""" + +from __future__ import annotations + +import platform +import sys +from hashlib import sha256 +from importlib import metadata +from pathlib import Path + +PROVENANCE_PACKAGES = ( + "litellm", + "numpy", + "pandas", + "policyengine", + "policyengine-us", + "policyengine-uk", +) + + +def installed_package_versions( + packages: tuple[str, ...] = PROVENANCE_PACKAGES, +) -> dict[str, str]: + """Return installed package versions for packages relevant to a run.""" + versions = {} + for package in packages: + try: + versions[package] = metadata.version(package) + except metadata.PackageNotFoundError: + continue + return versions + + +def runtime_provenance() -> dict: + """Return serializable Python and dependency provenance.""" + return { + "python": { + "version": platform.python_version(), + "implementation": platform.python_implementation(), + "executable": sys.executable, + }, + "packages": installed_package_versions(), + "lockfiles": dependency_lockfile_hashes(), + } + + +def dependency_lockfile_hashes(root: Path | None = None) -> dict[str, str]: + """Return hashes for dependency lockfiles committed with the repo.""" + root = Path(__file__).resolve().parents[1] if root is None else Path(root) + lockfiles = {} + for filename in ("uv.lock",): + path = root / filename + if path.is_file(): + lockfiles[filename] = 
sha256(path.read_bytes()).hexdigest() + return lockfiles diff --git a/tests/test_eval_no_tools.py b/tests/test_eval_no_tools.py index 34dd430..41896cc 100644 --- a/tests/test_eval_no_tools.py +++ b/tests/test_eval_no_tools.py @@ -1557,10 +1557,13 @@ def test_run_no_tools_eval_writes_resume_metadata( assert metadata_path.exists() metadata = json.loads(metadata_path.read_text()) assert metadata["task"] == "eval_no_tools_batch" + assert metadata["generated_at_utc"] assert metadata["scenario_count"] == 1 assert metadata["programs"] == ["income_tax"] assert metadata["models"] == {"gpt-5.4": "gpt-5.4"} assert metadata["policyengine_bundles"]["us"]["model_package"] == "policyengine-us" + assert metadata["runtime_environment"]["python"]["version"] + assert metadata["runtime_environment"]["packages"]["litellm"] @patch("policybench.eval_no_tools.run_single_no_tools") diff --git a/tests/test_provenance.py b/tests/test_provenance.py new file mode 100644 index 0000000..371887a --- /dev/null +++ b/tests/test_provenance.py @@ -0,0 +1,34 @@ +"""Tests for benchmark runtime provenance helpers.""" + +from policybench.provenance import ( + dependency_lockfile_hashes, + installed_package_versions, + runtime_provenance, +) + + +def test_installed_package_versions_skips_missing_packages(): + versions = installed_package_versions(("definitely-not-installed-policybench",)) + + assert versions == {} + + +def test_runtime_provenance_includes_python_and_packages(): + provenance = runtime_provenance() + + assert provenance["python"]["version"] + assert provenance["python"]["implementation"] + assert provenance["python"]["executable"] + assert "litellm" in provenance["packages"] + assert "uv.lock" in provenance["lockfiles"] + + +def test_dependency_lockfile_hashes_hashes_existing_lockfiles(tmp_path): + lockfile = tmp_path / "uv.lock" + lockfile.write_text("lock contents", encoding="utf-8") + + hashes = dependency_lockfile_hashes(tmp_path) + + assert hashes == { + "uv.lock": 
"af45314a8a7cff86a2eb1073b95f9bc85fad1640809e031318555ce2f4bcf760" + }