Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions policybench/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,7 @@ def main():
if args.command in {"reference-outputs", "ground-truth"}:
from policybench.ground_truth import calculate_ground_truth
from policybench.policyengine_runtime import runtime_metadata_for_country
from policybench.provenance import runtime_provenance
from policybench.scenarios import (
generate_scenarios,
get_uk_dataset_path,
Expand Down Expand Up @@ -488,6 +489,7 @@ def main():
"programs": sorted(programs),
"output": args.output,
"scenario_manifest_output": args.scenario_manifest_output,
"runtime_environment": runtime_provenance(),
**runtime_metadata_for_country(
args.country,
source_dataset_path=source_dataset_path,
Expand Down
4 changes: 4 additions & 0 deletions policybench/eval_no_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import signal
import threading
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable

Expand All @@ -22,6 +23,7 @@
make_no_tools_batch_prompt,
make_no_tools_batch_repair_prompt,
)
from policybench.provenance import runtime_provenance
from policybench.scenarios import Scenario, scenario_to_dict
from policybench.spec import expand_programs_for_scenario

Expand Down Expand Up @@ -1419,13 +1421,15 @@ def _build_resume_metadata(
return {
"metadata_version": RESUME_METADATA_VERSION,
"task": task,
"generated_at_utc": datetime.now(timezone.utc).isoformat(),
"run_id": run_id,
"include_explanations": include_explanations,
"scenario_count": len(scenarios),
"scenario_hash": hashlib.sha256(scenario_signature.encode("utf-8")).hexdigest(),
"programs": sorted(programs),
"models": {name: models[name] for name in sorted(models)},
"policyengine_bundles": policyengine_bundles_for_countries(countries),
"runtime_environment": runtime_provenance(),
}


Expand Down
55 changes: 55 additions & 0 deletions policybench/provenance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""Runtime provenance helpers for benchmark artifacts."""

from __future__ import annotations

import platform
import sys
from hashlib import sha256
from importlib import metadata
from pathlib import Path

PROVENANCE_PACKAGES = (
"litellm",
"numpy",
"pandas",
"policyengine",
"policyengine-us",
"policyengine-uk",
)


def installed_package_versions(
packages: tuple[str, ...] = PROVENANCE_PACKAGES,
) -> dict[str, str]:
"""Return installed package versions for packages relevant to a run."""
versions = {}
for package in packages:
try:
versions[package] = metadata.version(package)
except metadata.PackageNotFoundError:
continue
return versions


def runtime_provenance() -> dict:
    """Return serializable Python and dependency provenance.

    Bundles interpreter details, installed versions of the packages in
    PROVENANCE_PACKAGES, and hashes of committed dependency lockfiles into
    a single JSON-serializable mapping suitable for embedding in run
    artifacts.
    """
    interpreter = {
        "version": platform.python_version(),
        "implementation": platform.python_implementation(),
        "executable": sys.executable,
    }
    return {
        "python": interpreter,
        "packages": installed_package_versions(),
        "lockfiles": dependency_lockfile_hashes(),
    }


def dependency_lockfile_hashes(root: Path | None = None) -> dict[str, str]:
"""Return hashes for dependency lockfiles committed with the repo."""
root = Path(__file__).resolve().parents[1] if root is None else Path(root)
lockfiles = {}
for filename in ("uv.lock",):
path = root / filename
if path.exists():
lockfiles[filename] = sha256(path.read_bytes()).hexdigest()
return lockfiles
3 changes: 3 additions & 0 deletions tests/test_eval_no_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -1557,10 +1557,13 @@ def test_run_no_tools_eval_writes_resume_metadata(
assert metadata_path.exists()
metadata = json.loads(metadata_path.read_text())
assert metadata["task"] == "eval_no_tools_batch"
assert metadata["generated_at_utc"]
assert metadata["scenario_count"] == 1
assert metadata["programs"] == ["income_tax"]
assert metadata["models"] == {"gpt-5.4": "gpt-5.4"}
assert metadata["policyengine_bundles"]["us"]["model_package"] == "policyengine-us"
assert metadata["runtime_environment"]["python"]["version"]
assert metadata["runtime_environment"]["packages"]["litellm"]


@patch("policybench.eval_no_tools.run_single_no_tools")
Expand Down
34 changes: 34 additions & 0 deletions tests/test_provenance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Tests for benchmark runtime provenance helpers."""

from policybench.provenance import (
dependency_lockfile_hashes,
installed_package_versions,
runtime_provenance,
)


def test_installed_package_versions_skips_missing_packages():
    """A distribution that is not installed is omitted, not an error."""
    result = installed_package_versions(("definitely-not-installed-policybench",))
    assert result == {}


def test_runtime_provenance_includes_python_and_packages():
    """The provenance snapshot carries interpreter, package, and lockfile data."""
    provenance = runtime_provenance()

    interpreter = provenance["python"]
    for field in ("version", "implementation", "executable"):
        assert interpreter[field]
    assert "litellm" in provenance["packages"]
    assert "uv.lock" in provenance["lockfiles"]


def test_dependency_lockfile_hashes_hashes_existing_lockfiles(tmp_path):
    """An existing uv.lock is hashed with SHA-256 over its raw bytes."""
    expected_digest = (
        "af45314a8a7cff86a2eb1073b95f9bc85fad1640809e031318555ce2f4bcf760"
    )
    (tmp_path / "uv.lock").write_text("lock contents", encoding="utf-8")

    assert dependency_lockfile_hashes(tmp_path) == {"uv.lock": expected_digest}