diff --git a/benchmarks/arteval_bench/.gitignore b/benchmarks/arteval_bench/.gitignore index 64b18d31..8de6bf48 100644 --- a/benchmarks/arteval_bench/.gitignore +++ b/benchmarks/arteval_bench/.gitignore @@ -117,3 +117,6 @@ a.out # Build directories build/ cmake-build-*/ + +# Duplicate task list copies (canonical: arteval_tasks.jsonl) +data/benchmark/arteval_tasks copy*.jsonl diff --git a/benchmarks/arteval_bench/README.md b/benchmarks/arteval_bench/README.md index bfc30f77..84cc1b78 100644 --- a/benchmarks/arteval_bench/README.md +++ b/benchmarks/arteval_bench/README.md @@ -182,5 +182,67 @@ The benchmark supports multiple AI agents: - **Claude Code**: Anthropic's code assistant - **Mini SWE Agent**: The compact version of [SWE-agent](https://github.com/SWE-agent) assistant - **OpenHands**: Open-source coding agent +- **ae_agent**: Claude Agent SDK–based agent (same logic as the standalone [artifact-agent](https://github.com/sys-intelligence/artifact-agent) repo), with full support for host/Docker, interactive mode, Skill, Sub-agent, per-task timeout, GPU, and optional container sync/commit/stop. To add your own agent to the benchmark, see [add_agents.md](add_agents.md). + +#### » ae_agent usage and options + +When using the **ae_agent** (`-a ae_agent` or `-a ae-agent`), you can pass the following from the command line and/or the task JSONL. + +**Command-line arguments** + +| Argument | Description | +|----------|-------------| +| `-i`, `--input_file` | Input JSONL file with tasks (default: `./data/benchmark/arteval_tasks.jsonl`). | +| `-o`, `--save_path` | Directory for results (default: `./outputs/ae__ae-agent_`). | +| `-a`, `--agent` | Agent name; use `ae_agent` or `ae-agent` for this agent. | +| `-m`, `--model_name` | Model name (e.g. `claude-sonnet-4-5-20250929`). | +| `--interactive` | After the task completes, keep a session open so you can give more instructions (requires a TTY). In Docker mode the runner is executed in the foreground via `docker exec -it`. 
| +| `--enable-skill` | Enable Claude Agent SDK Skill (load from `~/.claude/skills/` and `.claude/skills/`). | +| `--enable-subagent` | Enable Claude Agent SDK Sub-agent (Task tool). | + +**JSONL task fields (per line)** + +| Field | Description | +|-------|-------------| +| `artifact_id` | Unique task identifier. | +| `artifact_dir` | Artifact directory name (relative to the JSONL file’s directory). | +| `artifact_readme` | Path to the README or task description file (relative to artifact root). | +| `artifact_url` | Optional. Git clone URL; used when `artifact_dir` is missing or the path does not exist. | +| `env` | `"local"` for host; Docker image name (e.g. `bastoica/ae-agent-ubuntu24.04:latest`) for Docker. | +| `evaluator` | Command to run after the agent (e.g. `python _agent_eval/main.py`). | +| `expected_score` | Expected score for this artifact (default 4). | +| `timeout` | Optional. Per-task timeout in seconds or milliseconds (see utils: values < 86400 are seconds, else milliseconds). | +| `gpu` | Optional. When `true`, pass `--gpus all` to Docker (Docker mode only). | +| `interactive` | Optional. When `true`, enable interactive mode for this task (overrides CLI default). | +| `enable_skill` | Optional. When `true`, enable Skill for this task. | +| `enable_subagent` | Optional. When `true`, enable Sub-agent for this task. | +| `keep_container` | Optional. When `false` (default for ae_agent), after the run the workspace is synced from the container to the host, the container is committed as an image, and the container is stopped. When `true`, the container is left running for inspection. 
| + +**Examples** + +```sh +# Host mode, default options +python src/main.py -i ./data/benchmark/arteval_tasks.jsonl -a ae_agent -o ./outputs/run1 + +# With interactive mode (TTY required for Docker) +python src/main.py --interactive -i ./data/benchmark/arteval_tasks.jsonl -a ae_agent -o ./outputs/run2 + +# Enable Skill and Sub-agent +python src/main.py --enable-skill --enable-subagent -i ./data/benchmark/arteval_tasks.jsonl -a ae_agent -o ./outputs/run3 +``` + +**Outputs (when using ae_agent)** + +Results are written under the given `save_path`: + +- `result.jsonl` — One JSON object per task (task_id, status, score, agent_run_results, etc.). +- `avg_score.json` — Benchmark summary (final_score, total_tasks). +- `ae_report_.md` — Per-task report (status, project path, log file, agent summary, and optional Docker image instructions). +- `summary.json` — Total and successful task counts and success rate (same format as standalone artifact-agent). +- When running via the benchmark entry, log paths and agent summary are filled from available data; standalone `python -m ae_agent.main` also produces `ae_log_.log`. + +**Docker + interactive** + +For Docker tasks with `interactive: true` (or `--interactive`), the benchmark runs the agent in the foreground via `docker exec -it` so you can interact in the same terminal. This requires a real TTY (e.g. running `python src/main.py ...` in a terminal, not under CI or with redirected stdin). If stdin is not a TTY, the run falls back to non-interactive (background runner) and a warning is logged. diff --git a/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README.md b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README.md new file mode 100644 index 00000000..c6b0c758 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README.md @@ -0,0 +1,13 @@ +# AE Agent Smoke Test Artifact + +Minimal task for quick testing of ae_agent (host/docker + evaluation). Should complete in under a minute. 
+ +## Task + +1. In this directory (the artifact root), create a file named **success.txt**. +2. The file must contain exactly the single character **1** (no newline required). +3. No other steps are required. + +Example (bash): `echo -n 1 > success.txt` + +After you finish, the benchmark will run an evaluation script that checks for this file and outputs a score (1 if correct, 0 otherwise). diff --git a/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README_SMOKE_TEST.md b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README_SMOKE_TEST.md new file mode 100644 index 00000000..bab00be4 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README_SMOKE_TEST.md @@ -0,0 +1,44 @@ +# AE Agent smoke test + +## Purpose + +- Test the agent under `src/agents/ae_agent`: **host** and **docker** modes, and the **evaluation script** flow (evaluator runs after the agent and parses score). +- Task is minimal (create `success.txt` with content `1` in the artifact root); finishes in a few minutes and avoids long runs with full arteval_tasks. + +## Files + +- **ae_agent_smoke/**: Minimal artifact + - `README.md`: Task description (create success.txt with content 1) + - `_agent_eval/check.py`: Evaluator; outputs `1` if success.txt exists and contains `1`, else `0` +- **ae_agent_smoke_test.jsonl**: Two lines + - First line: `"env": "local"`, run ae_agent + evaluator on host + - Second line: `"env": "bastoica/ae-agent-ubuntu24.04:latest"` (Docker image), run ae_agent + evaluator in Docker + +## How to run + +From the **benchmarks/arteval_bench** directory: + +```bash +# Set ANTHROPIC_API_KEY or ANTHROPIC_FOUNDRY_API_KEY first +python src/main.py \ + -i ./data/benchmark/ae_agent_smoke_test.jsonl \ + -a ae_agent \ + -m claude-sonnet-4-5-20250929 \ + -o ./outputs/ae_agent_smoke_$(date +%Y%m%d_%H%M%S) +``` + +- **Host task**: Runs the agent on the host, then runs `python3 _agent_eval/check.py` on the host to get the score. 
+- **Docker task**: Runs the agent in the container, then runs the evaluator in the container to get the score; the container is kept running by default for debugging. + +Results are under the `-o` directory: `result.jsonl` (one JSON object per line with `score`, `status`, `test_method`, etc.) and `avg_score.json`. + +## Interactive mode + +The command above runs in **non-interactive** mode (no `--interactive` flag and no `interactive: true` in the task JSONL). To test interactive mode: + +- Use ae_agent’s main entry with `--interactive`, and set `"env": "local"` or `"run_on_host": true` / `"env": "docker"` in the JSONL for the task, for example: + ```bash + cd src/agents + python -m ae_agent.main --interactive -i ../../data/benchmark/ae_agent_smoke_test.jsonl -o ../../outputs/ae_agent_smoke_int + ``` + (run `python -m ae_agent.main` from `src/agents`, the package’s parent directory, so the `ae_agent` package is importable) +- In interactive mode, after the first task completes you can keep typing instructions; type `quit` or `exit` to end. diff --git a/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/_agent_eval/check.py b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/_agent_eval/check.py new file mode 100644 index 00000000..e0d7c479 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/_agent_eval/check.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +"""Minimal evaluator for ae_agent_smoke: output 1 if success.txt exists and contains '1', else 0. + +Output must be a single digit on a line (or last line) for benchmark score parsing. 
+""" +import os +import sys + +def main(): + root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + path = os.path.join(root, "success.txt") + if os.path.isfile(path): + with open(path, "r") as f: + content = f.read().strip() + if content == "1": + print(1) + sys.exit(0) + print(0) + sys.exit(0) + +if __name__ == "__main__": + main() diff --git a/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke_test.jsonl b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke_test.jsonl new file mode 100644 index 00000000..de29995c --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke_test.jsonl @@ -0,0 +1,2 @@ +{"artifact_id": "ae_agent_smoke_host", "artifact_dir": "ae_agent_smoke", "artifact_readme": "ae_agent_smoke/README.md", "evaluator": "python3 _agent_eval/check.py", "expected_score": 1, "env": "local"} +{"artifact_id": "ae_agent_smoke_docker", "artifact_dir": "ae_agent_smoke", "artifact_readme": "ae_agent_smoke/README.md", "evaluator": "python3 _agent_eval/check.py", "expected_score": 1, "env": "bastoica/ae-agent-ubuntu24.04:latest", "timeout": 120000} diff --git a/benchmarks/arteval_bench/data/benchmark/arteval_tasks.jsonl b/benchmarks/arteval_bench/data/benchmark/arteval_tasks.jsonl index 1f46440a..8928ead5 100644 --- a/benchmarks/arteval_bench/data/benchmark/arteval_tasks.jsonl +++ b/benchmarks/arteval_bench/data/benchmark/arteval_tasks.jsonl @@ -1,6 +1,6 @@ -{"artifact_id": "sosp24_wasabi", "artifact_dir": "sosp24_wasabi", "artifact_readme": "sosp24_wasabi/wasabi/README.md", "artifact_url": "https://github.com/bastoica/wasabi/tree/sosp24-ae", "evaluator": "sosp24_wasabi/wasabi/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} -{"artifact_id": "osdi24_anvil", "artifact_dir": "osdi24_anvil", "artifact_readme": "osdi24_anvil/anvil/README.md", "artifact_url": "https://github.com/anvil-verifier/anvil", "evaluator": "osdi24_anvil/_agent_eval/main.py", "expected_score": 4, "docer_env": 
"bastoica/ae-agent-ubuntu24.04:latest"} -{"artifact_id": "sosp23_acto", "artifact_dir": "sosp23_acto", "artifact_readme": "sosp23_acto/acto/README.md", "artifact_url": "https://github.com/xlab-uiuc/acto", "evaluator": "sosp23_acto/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} -{"artifact_id": "eurosys25_egwalker", "artifact_dir": "eurosys25_egwalker", "artifact_readme": "eurosys25_egwalker/egwalker/README.md", "artifact_url": "https://github.com/josephg/egwalker-paper", "evaluator": "eurosys25_egwalker/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} -{"artifact_id": "eurosys25_depsurf", "artifact_dir": "eurosys25_depsurf", "artifact_readme": "eurosys25_depsurf/depsurf/README.md", "artifact_url": "https://github.com/ShawnZhong/DepSurf", "evaluator": "eurosys25_depsurf/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} -{"artifact_id": "osdi24_eet", "artifact_dir": "osdi24_eet", "artifact_readme": "osdi24_eet/eet/README.md", "artifact_url": "https://github.com/JZuming/EET", "evaluator": "osdi24_eet/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} \ No newline at end of file +{"artifact_id": "sosp24_wasabi", "artifact_dir": "sosp24_wasabi", "artifact_readme": "sosp24_wasabi/wasabi/README.md", "artifact_url": "https://github.com/bastoica/wasabi/tree/sosp24-ae", "evaluator": "sosp24_wasabi/wasabi/_agent_eval/main.py", "expected_score": 4, "env": "bastoica/ae-agent-ubuntu24.04:latest"} +{"artifact_id": "osdi24_anvil", "artifact_dir": "osdi24_anvil", "artifact_readme": "osdi24_anvil/anvil/README.md", "artifact_url": "https://github.com/anvil-verifier/anvil", "evaluator": "osdi24_anvil/_agent_eval/main.py", "expected_score": 4, "env": "bastoica/ae-agent-ubuntu24.04:latest"} +{"artifact_id": "sosp23_acto", "artifact_dir": "sosp23_acto", "artifact_readme": "sosp23_acto/acto/README.md", 
"artifact_url": "https://github.com/xlab-uiuc/acto", "evaluator": "sosp23_acto/_agent_eval/main.py", "expected_score": 4, "env": "bastoica/ae-agent-ubuntu24.04:latest"} +{"artifact_id": "eurosys25_egwalker", "artifact_dir": "eurosys25_egwalker", "artifact_readme": "eurosys25_egwalker/egwalker/README.md", "artifact_url": "https://github.com/josephg/egwalker-paper", "evaluator": "eurosys25_egwalker/_agent_eval/main.py", "expected_score": 4, "env": "bastoica/ae-agent-ubuntu24.04:latest"} +{"artifact_id": "eurosys25_depsurf", "artifact_dir": "eurosys25_depsurf", "artifact_readme": "eurosys25_depsurf/depsurf/README.md", "artifact_url": "https://github.com/ShawnZhong/DepSurf", "evaluator": "eurosys25_depsurf/_agent_eval/main.py", "expected_score": 4, "env": "bastoica/ae-agent-ubuntu24.04:latest"} +{"artifact_id": "osdi24_eet", "artifact_dir": "osdi24_eet", "artifact_readme": "osdi24_eet/eet/README.md", "artifact_url": "https://github.com/JZuming/EET", "evaluator": "osdi24_eet/_agent_eval/main.py", "expected_score": 4, "env": "bastoica/ae-agent-ubuntu24.04:latest"} diff --git a/benchmarks/arteval_bench/env.toml b/benchmarks/arteval_bench/env.toml index eac33edd..564e06ab 100644 --- a/benchmarks/arteval_bench/env.toml +++ b/benchmarks/arteval_bench/env.toml @@ -2,7 +2,7 @@ AZURE_API_KEY = "XXX" AZURE_API_BASE = "XXXX" AZURE_API_VERSION = "XXX" -ANTHROPIC_API_KEY = "sk-XXXX" +ANTHROPIC_API_KEY = "YOUR_ANTHROPIC_API_KEY" [hardware] use_gpu = false diff --git a/benchmarks/arteval_bench/run_ae_agent_smoke_test.sh b/benchmarks/arteval_bench/run_ae_agent_smoke_test.sh new file mode 100755 index 00000000..dba27087 --- /dev/null +++ b/benchmarks/arteval_bench/run_ae_agent_smoke_test.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Run ae_agent smoke test under arteval_bench (host + docker, with evaluation). 
+# Usage: ./run_ae_agent_smoke_test.sh [model_name] +# Default model: claude-sonnet-4-5-20250929 + +set -e +BENCH_ROOT="$(cd "$(dirname "$0")" && pwd)" +cd "$BENCH_ROOT" +MODEL="${1:-claude-sonnet-4-5-20250929}" +OUT_DIR="./outputs/ae_agent_smoke_$(date +%Y%m%d_%H%M%S)" +echo "==> AE Agent smoke test (host + docker + evaluation)" +echo " Model: $MODEL" +echo " Output: $OUT_DIR" +echo "" +python src/main.py \ + -i ./data/benchmark/ae_agent_smoke_test.jsonl \ + -a ae_agent \ + -m "$MODEL" \ + -o "$OUT_DIR" +echo "" +echo "==> Done. Results: $OUT_DIR/result.jsonl and $OUT_DIR/avg_score.json" diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/README.md b/benchmarks/arteval_bench/src/agents/ae_agent/README.md new file mode 100644 index 00000000..f630da1d --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/README.md @@ -0,0 +1,62 @@ +# AE Agent (ArtEval sub-agent) + +This agent is the **ae_agent** for the system-intelligence-benchmark ArtEval benchmark, with the same logic as the standalone [ae-agent](https://github.com/Couen/ae-agent) repo. It runs inside the benchmark container using the Claude Agent SDK to execute artifact evaluation tasks. + +## Files + +- **install.sh**: Installs `claude-agent-sdk` inside the container for use by runner.py. +- **runner.sh**: Entry script; invoked as `runner.sh `. Uses `/agent/current_task.txt` when the benchmark passes the task via file. +- **runner.py**: Runs the task with Claude Agent SDK; supports 429 rate-limit retry; second argument can be task text or path to a task file. Artifact path in container is `/repo`. +- **run_eval.py**: Single-task orchestration: `env='local'` runs on host, otherwise runs in Docker (requires swerex/swe-rex). +- **main.py**: CLI entry for batch runs from JSONL; supports host or Docker per task. +- **utils.py**: Timeout, task/path helpers, Tee, reports, summary (used by runner, main, run_eval). +- **__init__.py**: Package marker. 
+ +## Usage from the benchmark + +From the benchmark root (`benchmarks/arteval_bench/`): + +```bash +python src/main.py -i ./data/benchmark/arteval_tasks.jsonl -a ae_agent -m claude-sonnet-4-5-20250929 -o ./outputs/ae_agent_run +``` + +You can also use `-a ae-agent`; it is equivalent to `ae_agent`. + +The benchmark will: + +1. Upload this agent to `/agent` in the container. +2. For ae_agent: write the task to `/agent/current_task.txt`, then run `runner.sh "$model" /agent/current_task.txt` (avoids shell quoting issues with large tasks). +3. Use long-running and live-log behavior (48h timeout, streamed logs, remove `_agent_eval` before run and re-upload before evaluation, container kept for debugging). +4. **Evaluation script flow** (same as claude_sdk): after the agent finishes, run the JSONL `evaluator` (test_method), e.g. `cd /repo && python _agent_eval/main.py`, parse output for `score` and write to result. +5. If set, pass through `ANTHROPIC_API_KEY`, `ANTHROPIC_FOUNDRY_API_KEY`, `ANTHROPIC_FOUNDRY_BASE_URL`, `CLAUDE_CODE_USE_FOUNDRY`. + +**Evaluation flow on host**: When `run_on_host=True` and the agent is ae_agent, `run_eval_in_env.run_eval_on_host` calls this package's `run_agent_then_eval()`: run the agent first, then run `test_method` on the host (e.g. `cd project_path && python _agent_eval/main.py`), parse score with `utils.parse_eval_score()`, and return a result with the same shape as the Docker path (`score`, `test_method`, `status`). + +## Dependencies + +- Python 3; `claude-agent-sdk` is installed in the container via `install.sh`. +- When running in Docker via the benchmark's `run_eval_in_env.py`, install `swerex` on the host (the benchmark includes it). When using this directory's `main.py` for Docker mode standalone, you also need `swe-rex`. + +## Running on host (local) + +You can run tasks on the **host** from this directory (without the benchmark's Docker flow): + +1. 
**Single or batch via main.py** + Use a JSONL where each line can set `"env": "local"` or `"run_on_host": true` to run that task on the host; others run in Docker (requires swerex). + + ```bash + cd benchmarks/arteval_bench/src/agents/ae_agent + python -m ae_agent.main -i /path/to/tasks.jsonl -a ae_agent -m claude-sonnet-4-5-20250929 -o ./outputs/host_run + ``` + +2. **Host mode requirements** + - Set `ANTHROPIC_API_KEY` or `ANTHROPIC_FOUNDRY_API_KEY` + - Docker installed and running (for prereq check; agent runs on host) + - `pip install claude-agent-sdk` + +3. **Docker mode from this directory** + If the JSONL has `"env": "docker"` (or `run_on_host` is not set), `main.py` runs that task in Docker via `run_eval.py` (requires `swe-rex`/`swerex`). + +## Relation to the standalone ae-agent repo + +The standalone ae-agent repo provides the same host/Docker CLI. This sub-agent includes both the **in-container** runner (used by the benchmark's `run_eval_in_env.py`) and **host/local** mode via `main.py` and `run_eval.py`. diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/__init__.py b/benchmarks/arteval_bench/src/agents/ae_agent/__init__.py new file mode 100644 index 00000000..ca489f55 --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/__init__.py @@ -0,0 +1,23 @@ +"""AE Agent - A tool for running Claude Agent SDK on artifact evaluation tasks. 
+ +Output files (under save_path): +- ae_report_<task_id>.md: Per-artifact report with status and agent summary +- ae_log_<task_id>.log: Per-artifact execution log +- result.jsonl: Per-task results (one JSON per line) +- summary.json: Overall statistics +""" + +from .main import cli_main, main +from .run_eval import run_agent_then_eval, run_eval +from .runner import build_system_prompt, run_agent +from .utils import parse_eval_score + +__all__ = [ + 'build_system_prompt', + 'cli_main', + 'main', + 'parse_eval_score', + 'run_agent', + 'run_agent_then_eval', + 'run_eval', +] diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/install.sh b/benchmarks/arteval_bench/src/agents/ae_agent/install.sh new file mode 100644 index 00000000..829de33d --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/install.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# Setup agent running environment inside Docker container. +# Ensures claude-agent-sdk is available so runner.py can import claude_agent_sdk. +set -e +if ! python3 -c "import claude_agent_sdk" 2>/dev/null; then + echo "Installing claude-agent-sdk..." + pip3 install claude-agent-sdk==0.1.24 || pip3 install --break-system-packages claude-agent-sdk==0.1.24 || true + if ! python3 -c "import claude_agent_sdk"; then + echo "WARNING: claude_agent_sdk still not importable; runner may fail." + fi +fi +echo "Agent environment ready." diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/main.py b/benchmarks/arteval_bench/src/agents/ae_agent/main.py new file mode 100644 index 00000000..ac39a3a7 --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/main.py @@ -0,0 +1,307 @@ +"""Main entry point for running artifact tasks. + +Supports both: +- Run from this directory: env=local (host) or env=docker per task in JSONL. +- Used as in-container runner when benchmark (arteval_bench) uploads this agent to /agent. 
+""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import sys +from dataclasses import dataclass +from datetime import datetime + +from .run_eval import make_error_result, run_eval +from .utils import ( + AGENT_SUMMARY_FALLBACK_MAX, + DEFAULT_MODEL, + LOG_OUTPUT_TRUNCATE_BYTES, + SUMMARY_BASENAME_TEMPLATE, + SUMMARY_INSTRUCTION, + Tee, + compute_and_write_summary, + docker_image_from_item, + enable_skill_from_item, + enable_subagent_from_item, + env_from_item, + get_task, + gpu_from_item, + interactive_from_item, + read_task_from_file, + resolve_project_path, + safe_task_id, + timeout_ms_from_item, + write_task_report, +) + + +def _build_task_with_summary(task: str, safe_id: str) -> tuple[str, str]: + """Append summary instruction to task. Returns (task, summary_basename).""" + summary_basename = SUMMARY_BASENAME_TEMPLATE.format(safe_id=safe_id) + full_task = task.rstrip() + SUMMARY_INSTRUCTION.format(basename=summary_basename) + return full_task, summary_basename + + +def _persist_result(save_path: str, result: dict, log_path: str) -> None: + """Write result to result.jsonl and append run output to log.""" + with open(f'{save_path}/result.jsonl', 'a+', encoding='utf-8') as fw: + fw.write(json.dumps(result, ensure_ascii=False) + '\n') + with open(log_path, 'a', encoding='utf-8') as lf: + lf.write(f'\nTask finished at {result["timestamp"]}, status: {result.get("status", "unknown")}\n') + lf.write('\n--- Agent run output ---\n') + run_out = str(result.get('agent_run_results', '')) + lf.write(run_out[:LOG_OUTPUT_TRUNCATE_BYTES]) + if len(run_out) > LOG_OUTPUT_TRUNCATE_BYTES: + lf.write('\n... 
(truncated)\n') + + +def _gather_agent_summary(project_path: str, summary_basename: str, result: dict) -> str: + """Read agent summary file or fallback to truncated run output.""" + summary_file = os.path.join(project_path, summary_basename) + if os.path.isfile(summary_file): + try: + with open(summary_file, encoding='utf-8') as f: + return f.read() + except OSError as e: + logging.warning('Failed to read summary file %s: %s', summary_file, e) + fallback = str(result.get('agent_run_results', ''))[:AGENT_SUMMARY_FALLBACK_MAX] + return fallback or '(No summary captured)' + + +def _persist_skipped(save_path: str, task_id: str, message: str) -> None: + """Append one result line for a skipped task so summary total is accurate.""" + result = { + 'task_id': task_id, + 'status': 'skipped', + 'message': message, + 'timestamp': datetime.now().isoformat(), + } + with open(f'{save_path}/result.jsonl', 'a+', encoding='utf-8') as fw: + fw.write(json.dumps(result, ensure_ascii=False) + '\n') + + +def _run_single_task( + item: dict, + model: str, + agent: str, + save_path: str, + input_file: str, + interactive_default: bool, + enable_skill_default: bool = False, + enable_subagent_default: bool = False, +) -> None: + """Process a single JSONL task: parse, run, write results and report.""" + env = env_from_item(item) + docker_image = docker_image_from_item(item, env=env) + use_gpu = gpu_from_item(item) + interactive = interactive_from_item(item) or interactive_default + enable_skill = enable_skill_from_item(item, enable_skill_default) + enable_subagent = enable_subagent_from_item(item, enable_subagent_default) + task_file = item.get('artifact_readme', None) + task_id = item.get('artifact_id', None) + timeout_ms = timeout_ms_from_item(item) + safe_id = safe_task_id(task_id) + + project_path, path_error = resolve_project_path(item, input_file, save_path) + if path_error: + print(path_error) + _persist_skipped(save_path, task_id or safe_id, path_error) + return + print(f'Project path: 
{project_path}') + + raw_task = read_task_from_file(project_path, task_file) if task_file else get_task('README.md') + task, summary_basename = _build_task_with_summary(raw_task, safe_id) + + task_file_path = os.path.join(save_path, f'current_task_{safe_id}.txt') + with open(task_file_path, 'w', encoding='utf-8') as f: + f.write(task) + + timeout_str = str(timeout_ms) if timeout_ms is not None else 'default' + print(f'Task {task_id}: env={env}, timeout_ms={timeout_str}, gpu={use_gpu}, interactive={interactive}, enable_skill={enable_skill}, enable_subagent={enable_subagent}') + + log_path = os.path.join(save_path, f'ae_log_{safe_id}.log') + with open(log_path, 'w', encoding='utf-8') as lf: + lf.write(f'Task {task_id} started at {datetime.now().isoformat()}\n') + lf.write(f'Project path: {project_path}\n') + lf.write(f'Env: {env}\n\n') + + # Run task (stdout/stderr teed to log), then persist result and report. + # Note: For env='local', agent_path is ignored; the in-process runner (this package) is used. 
+ old_stdout, old_stderr = sys.stdout, sys.stderr + try: + with Tee(sys.stdout, log_path) as tee_out: + with Tee(sys.stderr, log_path) as tee_err: + sys.stdout, sys.stderr = tee_out, tee_err + result = run_eval( + env=env, + docker_image=docker_image, + project_path=project_path, + task_id=task_id, + task=task, + task_file_path=task_file_path, + model=model, + agent_path=agent, + save_path=save_path, + timeout_ms=timeout_ms, + use_gpu=use_gpu, + interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + except Exception as e: + sys.stdout, sys.stderr = old_stdout, old_stderr + logging.exception('run_eval failed for task %s: %s', task_id, e) + result = make_error_result(task_id, task, project_path, str(e), env) + finally: + sys.stdout, sys.stderr = old_stdout, old_stderr + + result['timestamp'] = datetime.now().isoformat() + result['log_file'] = log_path + _persist_result(save_path, result, log_path) + + agent_summary = _gather_agent_summary(project_path, summary_basename, result) + write_task_report(save_path, safe_id, task_id, result, log_path, agent_summary) + print(f'Task {task_id} completed. 
Status: {result.get("status", "unknown")}') + + +def main(input_file, model, agent, save_path, interactive_default: bool = False, enable_skill_default: bool = False, enable_subagent_default: bool = False): + """Main function for running tasks.""" + if not os.path.isfile(input_file): + logging.error('Input file not found: %s', input_file) + sys.exit(1) + + print(f'Using model: {model}, agent: {agent}') + + with open(input_file, encoding='utf-8') as f: + for line_no, line in enumerate(f, start=1): + if not line.strip(): + continue + try: + item = json.loads(line) + except json.JSONDecodeError as e: + print(f'Skipping invalid JSON at line {line_no}: {e}') + _persist_skipped(save_path, f'line_{line_no}', f'Invalid JSON: {e}') + continue + + _run_single_task( + item=item, + model=model, + agent=agent, + save_path=save_path, + input_file=input_file, + interactive_default=interactive_default, + enable_skill_default=enable_skill_default, + enable_subagent_default=enable_subagent_default, + ) + + total_count, success_count = compute_and_write_summary(save_path) + print(f'All tasks completed: {success_count}/{total_count} succeeded.') + + +@dataclass +class _ResolvedConfig: + """Resolved CLI configuration ready for main().""" + + input_file: str + model: str + agent: str + save_path: str + interactive_default: bool + enable_skill_default: bool + enable_subagent_default: bool + + +def _parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser(description='AE Agent - Run Claude Agent SDK on artifact tasks') + parser.add_argument( + '-i', + '--input_file', + help='Input JSONL file with tasks', + default='./data/benchmark/arteval_tasks.jsonl', + ) + parser.add_argument('-o', '--save_path', help='Result save path', default=None) + parser.add_argument( + '-a', + '--agent', + help='Agent name (default: ae-agent)', + default='ae-agent', + ) + parser.add_argument( + '-m', + '--model_name', + help='Model Name', + 
default=DEFAULT_MODEL, + ) + parser.add_argument( + '--interactive', + action='store_true', + help='Enable interactive mode (continue giving agent instructions after task completes)', + ) + parser.add_argument( + '--enable-skill', + action='store_true', + help='Enable Claude Agent SDK Skill (load from ~/.claude/skills/ and .claude/skills/)', + ) + parser.add_argument( + '--enable-subagent', + action='store_true', + help='Enable Claude Agent SDK Sub-agent (Task tool)', + ) + return parser.parse_args() + + +def _resolve_paths(args: argparse.Namespace) -> _ResolvedConfig: + """Resolve paths and agent from parsed args.""" + model_name = args.model_name + agent = args.agent + input_file = args.input_file + save_path = args.save_path + + if save_path is None: + str_model_name = model_name.replace('/', '_').lower() + timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + save_path = os.path.join('./outputs', f'ae_{str_model_name}_ae-agent_{timestamp}') + + # When running from this directory (standalone or as arteval_bench agent), use script dir as agent path + if agent in ('ae-agent', 'ae_agent', 'claude_sdk'): + agent = os.path.dirname(os.path.abspath(__file__)) + + save_path = os.path.abspath(os.path.expanduser(save_path)) + os.makedirs(save_path, exist_ok=True) + + return _ResolvedConfig( + input_file=input_file, + model=model_name, + agent=agent, + save_path=save_path, + interactive_default=getattr(args, 'interactive', False), + enable_skill_default=getattr(args, 'enable_skill', False), + enable_subagent_default=getattr(args, 'enable_subagent', False), + ) + + +def cli_main(): + """CLI entry point.""" + args = _parse_args() + config = _resolve_paths(args) + print(f'Input file: {config.input_file}') + print(f'Save path: {config.save_path}') + print(f'Agent path: {config.agent}') + main( + config.input_file, + config.model, + config.agent, + config.save_path, + interactive_default=config.interactive_default, + enable_skill_default=config.enable_skill_default, + 
enable_subagent_default=config.enable_subagent_default, + ) + + +if __name__ == '__main__': + cli_main() diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py b/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py new file mode 100644 index 00000000..75be027c --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py @@ -0,0 +1,1015 @@ +"""Orchestration for executing artifact tasks in Docker or on host. + +Single entry point: run_eval(env, project_path, task_id, ...). +- env='local' -> _run_local() -> runner.run_agent() directly on host +- env != 'local' -> _run_in_docker() -> runner.py executed inside container +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import os +import shutil +import subprocess +import sys +import tempfile +import time +from dataclasses import dataclass +from pathlib import Path + +from .runner import run_agent +from .utils import ( + DEFAULT_DOCKER_IMAGE, + apply_timeout_env, + has_api_key, + is_local_env, + parse_eval_score, + resolve_timeout_ms, + safe_task_id, + status_from_exit_code, + timeout_env_dict, +) + +SWEREX_AVAILABLE = False + + +def _import_swerex(): + """Try importing swerex under both package names (swerex and swe_rex). + + The package was renamed; we support both for backward compatibility. + Returns (DockerDeploymentConfig, BashAction, CreateBashSessionRequest, UploadRequest) + or raises ImportError. 
+ """ + for pkg in ('swerex', 'swe_rex'): + try: + mod_docker = __import__(f'{pkg}.deployment.docker', fromlist=['DockerDeploymentConfig']) + mod_runtime = __import__( + f'{pkg}.runtime.abstract', fromlist=['BashAction', 'CreateBashSessionRequest', 'UploadRequest'] + ) + return ( + mod_docker.DockerDeploymentConfig, + mod_runtime.BashAction, + mod_runtime.CreateBashSessionRequest, + mod_runtime.UploadRequest, + ) + except ImportError: + continue + raise ImportError("Neither 'swerex' nor 'swe_rex' is installed") + + +try: + DockerDeploymentConfig, BashAction, CreateBashSessionRequest, UploadRequest = _import_swerex() + SWEREX_AVAILABLE = True +except ImportError: + logging.warning('swerex/swe-rex not available. Docker mode will not work.') + + +# Progress log every 5 minutes when runner is still running. +_PROGRESS_LOG_INTERVAL_SEC = 300 + +# Poll interval for checking runner status. +_POLL_INTERVAL_SEC = 10.0 + + +@dataclass +class _RunnerResult: + """Result from a Docker runner process.""" + + exit_code: int + output: str + + +def _make_eval_result( + task_id: str, + task: str, + project_path: str, + agent_output: str, + status: str, + run_on_host: bool, + *, + container_id: str | None = None, + saved_image: str | None = None, + container_stopped: bool = False, + message_count: int | None = None, + score: int | None = None, + test_method: str | None = None, +) -> dict: + """Build unified eval result dict for both host and Docker modes.""" + result = { + 'task_id': task_id, + 'task': task, + 'project_path': project_path, + 'agent_run_results': agent_output, + 'status': status, + 'run_on_host': run_on_host, + 'container_id': container_id, + 'saved_image': saved_image, + 'container_stopped': container_stopped, + } + if message_count is not None: + result['message_count'] = message_count + if score is not None: + result['score'] = score + if test_method is not None: + result['test_method'] = test_method + return result + + +def make_error_result( + task_id: str, + 
def make_error_result(
    task_id: str,
    task: str,
    project_path: str,
    error_message: str,
    env: str,
) -> dict:
    """Build result dict for run_eval failure (exception/timeout). Same shape as normal result."""
    return _make_eval_result(
        task_id,
        task,
        project_path,
        error_message,
        'error',
        is_local_env(env),
    )


# ---------------------------------------------------------------------------
# Host mode
# ---------------------------------------------------------------------------


def _check_host_prerequisites() -> bool:
    """Check that docker, python, and API key are available on the host."""
    # Docker binary present?
    if shutil.which('docker') is None:
        logging.error('Docker is not installed on host.')
        return False
    # Docker daemon reachable?
    probe = subprocess.run(['docker', 'ps'], capture_output=True, timeout=10)
    if probe.returncode != 0:
        logging.error('Docker is not running on host.')
        return False
    # Credentials available?
    if not has_api_key():
        logging.error('Neither ANTHROPIC_API_KEY nor ANTHROPIC_FOUNDRY_API_KEY is set.')
        return False
    return True


def _write_claude_settings(timeout_ms: int):
    """Write ~/.claude/settings.json with timeout configuration."""
    settings_dir = Path.home() / '.claude'
    settings_dir.mkdir(exist_ok=True)
    payload = {'env': timeout_env_dict(timeout_ms)}
    with open(settings_dir / 'settings.json', 'w', encoding='utf-8') as fh:
        json.dump(payload, fh, indent=2)


async def _run_local(
    project_path,
    task_id,
    task,
    model,
    timeout_ms: int,
    *,
    skip_prereq_check: bool = False,
    interactive: bool = False,
    enable_skill: bool = False,
    enable_subagent: bool = False,
):
    """Run one task on host by delegating to runner.run_agent()."""
    banner = '=' * 80
    print(banner)
    print('Running task on HOST MACHINE')
    print(banner)

    if not skip_prereq_check and not _check_host_prerequisites():
        raise RuntimeError('Host prerequisites check failed')

    _write_claude_settings(timeout_ms)
    # run_eval() already calls apply_timeout_env() for local; no need to duplicate here.

    project_path = os.path.abspath(project_path)
    if not os.path.isdir(project_path):
        raise RuntimeError(f'Project path does not exist: {project_path}')

    for line in (
        f'Project path: {project_path}',
        f'Task ID: {task_id}',
        f'Model: {model}',
    ):
        print(line)

    agent_result = await run_agent(
        model,
        task,
        env='local',
        artifact_path=project_path,
        timeout_ms=timeout_ms,
        interactive=interactive,
        enable_skill=enable_skill,
        enable_subagent=enable_subagent,
    )

    return _make_eval_result(
        task_id,
        task,
        project_path,
        agent_result['output'],
        status_from_exit_code(agent_result['exit_code']),
        run_on_host=True,
        message_count=agent_result['message_count'],
    )
+ """ + timeout_ms = resolve_timeout_ms(timeout_ms) + if not skip_prereq_check and not _check_host_prerequisites(): + raise RuntimeError('Host prerequisites check failed') + apply_timeout_env(timeout_ms) + _write_claude_settings(timeout_ms) + + project_path = os.path.abspath(project_path) + if not os.path.isdir(project_path): + raise RuntimeError(f'Project path does not exist: {project_path}') + + # 1. Run agent + agent_result = await run_agent( + model, + task, + env='local', + artifact_path=project_path, + timeout_ms=timeout_ms, + interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + agent_output = agent_result['output'] + agent_status = status_from_exit_code(agent_result['exit_code']) + + # 2. Run evaluation script if provided + if test_method and test_method.strip(): + try: + # Evaluator from JSONL is a path to main.py; run with python from project root. + if test_method.strip().endswith('.py'): + eval_cmd = f'cd {project_path} && python {test_method.strip()}' + else: + eval_cmd = f'cd {project_path} && {test_method}' + eval_result = subprocess.run( + eval_cmd, + shell=True, + capture_output=True, + text=True, + timeout=_EVAL_SCRIPT_TIMEOUT_SEC, + ) + test_output = (eval_result.stdout or '').strip() + score = parse_eval_score(test_output) + status = 'success' if agent_status == 'success' else agent_status + except subprocess.TimeoutExpired: + test_output = '(evaluation script timed out)' + score = 0 + status = 'error' + except Exception as e: + test_output = str(e) + score = 0 + status = f'error: {e}' + else: + test_output = '' + score = 0 + status = agent_status + + return _make_eval_result( + task_id, + task, + project_path, + agent_output, + status, + run_on_host=True, + score=score, + test_method=test_method or '', + ) + + +def run_agent_then_eval( + project_path: str, + task_id: str, + task: str, + model: str, + test_method: str | None, + save_path: str, + timeout_ms: int | None = None, + *, + skip_prereq_check: 
bool = False, + interactive: bool = False, + enable_skill: bool = False, + enable_subagent: bool = False, +) -> dict: + """Synchronous entry: run agent on host then evaluation script; return result with score. + + Called by arteval_bench run_eval_in_env.run_eval_on_host when agent is ae_agent. + """ + return asyncio.run( + _run_agent_then_eval_async( + project_path, + task_id, + task, + model, + test_method, + save_path, + timeout_ms=timeout_ms, + skip_prereq_check=skip_prereq_check, + interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + ) + + +# --------------------------------------------------------------------------- +# Docker helpers +# --------------------------------------------------------------------------- + + +def _validate_agent_path(agent_path: str) -> None: + """Ensure agent_path exists and has required files.""" + if not agent_path or not os.path.isdir(agent_path): + raise RuntimeError(f'Agent path does not exist: {agent_path}') + for name in ('runner.sh', 'runner.py', 'install.sh'): + if not os.path.isfile(os.path.join(agent_path, name)): + raise RuntimeError(f'Agent path missing required file: {name}') + + +def _stdin_is_tty() -> bool: + """Return True if stdin is a real TTY (required for docker exec -it).""" + return hasattr(sys.stdin, 'isatty') and sys.stdin.isatty() + + +def _run_docker_cmd( + args: list[str], + *, + timeout: int = 60, + on_success_message: str | None = None, + on_fail_message: str = 'docker command failed', +) -> bool: + """Run a docker subprocess. 
def _run_docker_cmd(
    args: list[str],
    *,
    timeout: int = 60,
    on_success_message: str | None = None,
    on_fail_message: str = 'docker command failed',
) -> bool:
    """Run a docker subprocess. Return True if returncode is 0, else False and log.

    Args:
        args: Full argv list (e.g. ['docker', 'cp', ...]); run without a shell.
        timeout: Seconds before the subprocess is aborted.
        on_success_message: Printed when the command succeeds, if given.
        on_fail_message: Prefix used in the warning log on failure.
    """
    try:
        r = subprocess.run(
            args,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        if r.returncode == 0:
            if on_success_message:
                print(on_success_message)
            return True
        # Prefer stderr, fall back to stdout, for the failure detail.
        logging.warning('%s: %s', on_fail_message, (r.stderr or r.stdout or '').strip())
        return False
    except subprocess.TimeoutExpired:
        logging.warning('docker command timed out (timeout=%ds)', timeout)
        return False
    except (OSError, subprocess.SubprocessError) as e:
        # OSError also covers a missing docker binary (FileNotFoundError).
        logging.warning('docker command error: %s', e)
        return False


def _merge_tree(src_dir: str, dst_dir: str, exclude: tuple[str, ...] = ('.venv', '.git', '__pycache__')) -> None:
    """Merge src_dir into dst_dir (copy missing/updated from src into dst).

    Recurses into directories that exist on both sides instead of replacing
    them wholesale; entries named in `exclude` are skipped at every level.
    """
    os.makedirs(dst_dir, exist_ok=True)
    for name in os.listdir(src_dir):
        if name in exclude:
            continue
        src_path = os.path.join(src_dir, name)
        dst_path = os.path.join(dst_dir, name)
        if os.path.isdir(src_path):
            if os.path.isdir(dst_path):
                _merge_tree(src_path, dst_path, exclude)
            elif os.path.exists(dst_path):
                # Destination exists but is a file/symlink; do not clobber it.
                logging.warning('Sync skip (destination not a dir): %s', dst_path)
            else:
                shutil.copytree(src_path, dst_path)
        else:
            try:
                shutil.copy2(src_path, dst_path)
            except OSError as e:
                # Best-effort sync: log and continue with remaining entries.
                logging.warning('Sync copy failed %s -> %s: %s', src_path, dst_path, e)


def _sync_workspace(container_id: str, project_path: str) -> None:
    """Copy /repo from container back to host project_path.

    Uses a temp copy plus merge with excludes to avoid overwriting host .venv
    (e.g. when container has .venv/lib64 as a directory and host has it as a
    symlink, which would cause 'cannot overwrite non-directory with directory').
    """
    project_abs = os.path.abspath(project_path)
    if not os.path.isdir(project_abs):
        print(f'WARNING: project_path missing, skipping sync: {project_abs}')
        return

    # Exclude .venv* and .git to avoid overwriting host venv or permission issues
    def _skip_sync(name: str) -> bool:
        return name == '.git' or name == '.venv' or name.startswith('.venv-')

    with tempfile.TemporaryDirectory(prefix='ae_sync_') as tmp:
        dest_tmp = os.path.join(tmp, 'repo')
        if not _run_docker_cmd(
            ['docker', 'cp', f'{container_id}:/repo', dest_tmp],
            timeout=600,
            on_fail_message='docker cp (to temp) failed',
        ):
            return
        # docker cp container:/repo dest_tmp puts repo contents into dest_tmp
        repo_src = dest_tmp
        for name in os.listdir(repo_src):
            if _skip_sync(name):
                continue
            src_path = os.path.join(repo_src, name)
            dst_path = os.path.join(project_abs, name)
            try:
                if os.path.isdir(src_path):
                    if os.path.exists(dst_path):
                        _merge_tree(src_path, dst_path)
                    else:
                        shutil.copytree(src_path, dst_path)
                else:
                    shutil.copy2(src_path, dst_path)
            except (OSError, shutil.Error) as e:
                # Per-entry failures are non-fatal; keep syncing the rest.
                logging.warning('Sync item %s failed: %s', name, e)
        print(f'Synced container /repo -> {project_abs}')


def _commit_container(container_id: str, task_id: str) -> str | None:
    """Commit container state as a Docker image. Returns image tag or None."""
    sid = safe_task_id(task_id, fallback='unknown_task')
    # Docker image tags must be lowercase.
    image_tag = f'ae-agent-{sid.lower()}:latest'
    if not _run_docker_cmd(
        ['docker', 'commit', container_id, image_tag],
        timeout=600,
        on_fail_message='docker commit failed',
    ):
        return None
    return image_tag


def _stop_container(container_id: str) -> bool:
    """Stop a Docker container. Returns True if stopped successfully."""
    return _run_docker_cmd(
        ['docker', 'stop', container_id],
        timeout=60,
        on_success_message=f'Stopped container {container_id}.',
        on_fail_message='docker stop failed',
    )


def _save_container(
    container_id: str,
    project_path: str,
    task_id: str,
) -> tuple[str | None, bool]:
    """Sync workspace, commit image, and stop container.

    Order matters: sync files out first, then snapshot the image, then stop.
    """
    _sync_workspace(container_id, project_path)
    image_tag = _commit_container(container_id, task_id)
    stopped = _stop_container(container_id)
    return image_tag, stopped


def save_container_after_run(container_id: str, project_path: str, task_id: str) -> tuple[str | None, bool]:
    """Sync workspace from container to host, commit as image, stop container.

    Public entry for run_eval_in_env when keep_container=False (original artifact-agent behavior).
    Returns (saved_image_tag, container_stopped).
    """
    return _save_container(container_id, project_path, task_id)


async def _get_container_id(runtime) -> str | None:
    """Get container hostname/ID from inside the container.

    Returns None when the hostname cannot be read or is the placeholder
    'unknown'; callers fall back to non-interactive mode in that case.
    """
    try:
        cid = (
            await _run_bash(
                runtime,
                'cat /etc/hostname 2>/dev/null || hostname 2>/dev/null || echo ""',
                10.0,
            )
        ).strip()
        return cid if cid and cid != 'unknown' else None
    except (AttributeError, TypeError, ValueError) as e:
        logging.debug('Could not get container ID: %s', e)
        return None


def _shell_escape(s: str) -> str:
    """Escape a string for use inside single-quoted shell arguments."""
    # Standard sh idiom: close quote, emit a double-quoted ', reopen quote.
    return s.replace("'", "'\"'\"'")
+ """ + api_key = os.environ.get('ANTHROPIC_API_KEY') + foundry_key = os.environ.get('ANTHROPIC_FOUNDRY_API_KEY') + env = dict(timeout_env_dict(timeout_ms)) + if api_key: + env['ANTHROPIC_API_KEY'] = api_key + if foundry_key: + env['ANTHROPIC_FOUNDRY_API_KEY'] = foundry_key + if not api_key: + env['ANTHROPIC_API_KEY'] = foundry_key + foundry_url = os.environ.get('ANTHROPIC_FOUNDRY_BASE_URL') + if foundry_url: + env['ANTHROPIC_FOUNDRY_BASE_URL'] = foundry_url + if os.environ.get('CLAUDE_CODE_USE_FOUNDRY') == '1': + env['CLAUDE_CODE_USE_FOUNDRY'] = '1' + if enable_skill: + env['AE_ENABLE_SKILL'] = '1' + if enable_subagent: + env['AE_ENABLE_SUBAGENT'] = '1' + return env + + +def _docker_exec_env_args( + timeout_ms: int, + *, + enable_skill: bool = False, + enable_subagent: bool = False, +) -> list[str]: + """Build -e VAR=value args for docker exec (env vars needed by runner.py).""" + env = _build_api_env_dict( + timeout_ms, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + args = [] + for k, v in env.items(): + args.extend(['-e', f'{k}={v}']) + return args + + +async def _upload_task(runtime, task: str, task_file_path: str | None): + """Upload task description to /agent/current_task.txt inside container.""" + tmpdir = tempfile.mkdtemp(prefix='ae_task_') + try: + dest = os.path.join(tmpdir, 'current_task.txt') + if task_file_path and os.path.isfile(task_file_path): + shutil.copy2(task_file_path, dest) + else: + with open(dest, 'w', encoding='utf-8') as f: + f.write(task) + await runtime.upload(UploadRequest(source_path=tmpdir, target_path='/agent_task_file')) + await _run_bash( + runtime, + 'cp /agent_task_file/current_task.txt /agent/current_task.txt', + 10.0, + ) + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + + +async def _setup_container_env( + runtime, timeout_ms: int, *, enable_skill: bool = False, enable_subagent: bool = False +): + """Set timeout and API keys inside the container.""" + env = _build_api_env_dict( + timeout_ms, + 
enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + parts = [f"export {k}='{_shell_escape(v)}'" for k, v in env.items()] + await _run_bash(runtime, ' && '.join(parts)) + + if not has_api_key(): + logging.warning('No API key found. Runner may fail.') + + +def _extract_output(res) -> str: + """Extract output string from swe-rex/bash action result.""" + return str(getattr(res, 'output', '')).strip() + + +async def _run_bash(runtime, command: str, timeout: float = 10.0) -> str: + """Run a Bash command in the container session and return its output. Reduces duplication.""" + res = await runtime.run_in_session(BashAction(command=command, timeout=timeout)) + return _extract_output(res) + + +async def _start_runner_background(runtime, model: str) -> str | None: + """Start runner.sh in background, return pid or None.""" + await _run_bash( + runtime, + 'rm -f /agent/runner.live.log && touch /agent/runner.live.log', + 10.0, + ) + output = await _run_bash( + runtime, + ( + f'stdbuf -oL -eL /agent/runner.sh "{model}" /agent/current_task.txt ' + f'> /agent/runner.live.log 2>&1 & ' + f'RUNNER_PID=$!; sleep 1; echo RUNNER_PID=$RUNNER_PID' + ), + 30.0, + ) + pid = None + for line in output.split('\n'): + if 'RUNNER_PID=' in line: + pid = line.split('RUNNER_PID=', 1)[1].strip() + break + if not pid or not pid.strip().isdigit(): + await asyncio.sleep(2) + pid = await _run_bash( + runtime, + "ps aux | grep '[r]unner.py' | awk '{print $2}' | head -1", + 10.0, + ) + pid = (pid or '').strip() + return pid if pid.isdigit() else None + + +async def _read_runner_log(runtime, elapsed: float, last_log: str) -> str: + """Read live log and print new content. 
async def _read_runner_log(runtime, elapsed: float, last_log: str) -> str:
    """Read live log and print new content. Returns updated last_log."""
    try:
        cur = await _run_bash(runtime, 'cat /agent/runner.live.log 2>/dev/null || echo ""', 30.0)
        if cur and cur != last_log:
            # If the log grew in place print only the appended suffix;
            # otherwise (truncated/rotated) print the whole new content.
            new = cur[len(last_log) :].strip() if cur.startswith(last_log) else cur
            if new:
                print(f'[log @ {elapsed:.0f}s]\n{new}', flush=True)
        return cur
    except (AttributeError, TypeError, ValueError) as e:
        logging.debug('Log read error: %s', e)
    # On read failure keep the previous snapshot so nothing is re-printed.
    return last_log


async def _check_runner_exited(runtime, pid: str | None) -> _RunnerResult | None:
    """Check if runner process has exited. Returns _RunnerResult if exited, else None."""
    if pid and pid.isdigit():
        # ps -p exits non-zero when the PID is gone.
        ps_out = await _run_bash(runtime, f'ps -p {pid} >/dev/null 2>&1; echo $?', 10.0)
        if ps_out.strip() != '0':
            # `wait` recovers the exit code when the runner was a shell child.
            code = await _run_bash(runtime, f'wait {pid} 2>/dev/null; echo $?', 30.0)
            ec = int(code.strip()) if code.strip().isdigit() else -1
            return _RunnerResult(exit_code=ec, output=f'exit_code={ec}')
    else:
        # PID was never captured (e.g. RUNNER_PID parse failed); detect exit by process count.
        cnt = await _run_bash(runtime, "ps aux | grep '[r]unner.py' | wc -l", 10.0)
        if not cnt.strip().isdigit() or int(cnt.strip()) == 0:
            return _RunnerResult(exit_code=-1, output='exit_code=unknown')
    return None


async def _handle_runner_timeout(runtime, pid: str | None) -> None:
    """Kill runner and print log tail on timeout."""
    if pid and pid.isdigit():
        try:
            # Graceful TERM first, then hard KILL; `|| true` keeps exit 0.
            await _run_bash(
                runtime,
                f'kill -TERM {pid} 2>/dev/null || kill -9 {pid} 2>/dev/null || true',
                10.0,
            )
        except (AttributeError, TypeError, ConnectionError) as e:
            logging.debug('Kill runner failed: %s', e)
    try:
        tail_str = await _run_bash(runtime, 'tail -n 200 /agent/runner.live.log', 30.0)
        print(f'Log tail (timeout):\n{tail_str}', flush=True)
    except (AttributeError, TypeError, ValueError) as e:
        logging.debug('Could not read log tail: %s', e)


async def _monitor_runner(runtime, model: str, timeout_s: float) -> _RunnerResult:
    """Start runner.sh in background and poll logs until it finishes or times out.

    Raises:
        TimeoutError: When the runner exceeds timeout_s; the process is killed
            and the log tail is printed before raising.
    """
    pid = await _start_runner_background(runtime, model)
    print(f'Runner started (pid={pid})', flush=True)

    start = time.monotonic()
    last_log = ''
    last_progress_at = 0.0

    while True:
        elapsed = time.monotonic() - start
        if elapsed >= timeout_s:
            break

        last_log = await _read_runner_log(runtime, elapsed, last_log)
        # Heartbeat so long quiet stretches still show liveness.
        if elapsed - last_progress_at >= _PROGRESS_LOG_INTERVAL_SEC:
            print(f'[still running @ {elapsed:.0f}s]', flush=True)
            last_progress_at = elapsed

        result = await _check_runner_exited(runtime, pid)
        if result is not None:
            print(f'Runner finished (exit_code={result.exit_code})', flush=True)
            return result

        await asyncio.sleep(_POLL_INTERVAL_SEC)

    await _handle_runner_timeout(runtime, pid)
    raise TimeoutError(f'Runner exceeded timeout {timeout_s}s')
async def _run_interactive_in_container(
    container_id: str,
    task_id: str,
    task: str,
    project_path: str,
    model: str,
    timeout_ms: int,
    *,
    enable_skill: bool = False,
    enable_subagent: bool = False,
) -> dict:
    """Run task + interactive in foreground via docker exec -it.

    The same agent session handles both task and follow-up, preserving context.
    Requires a real TTY on stdin; the caller checks _stdin_is_tty() first.
    """
    print(
        '\n'
        + '=' * 60
        + '\nTask + interactive mode (foreground, context preserved).\n'
        + "Type 'quit' or 'exit' to end the interactive session.\n"
        + '=' * 60,
        flush=True,
    )
    exec_args = [
        'docker',
        'exec',
        '-it',
        *_docker_exec_env_args(
            timeout_ms,
            enable_skill=enable_skill,
            enable_subagent=enable_subagent,
        ),
        container_id,
        'python3',
        '-u',  # unbuffered so the user sees agent output live
        '/agent/runner.py',
        model,
        '/agent/current_task.txt',
        '--interactive',
    ]
    try:
        # to_thread keeps the event loop responsive while the blocking
        # foreground exec inherits the real stdio streams.
        proc = await asyncio.to_thread(
            subprocess.run,
            exec_args,
            stdin=sys.__stdin__,
            stdout=sys.__stdout__,
            stderr=sys.__stderr__,
        )
        run_exit_code = proc.returncode
    except (OSError, subprocess.SubprocessError) as e:
        logging.warning('Foreground execution failed for task %s: %s', task_id, e)
        run_exit_code = 1

    return _make_eval_result(
        task_id,
        task,
        project_path,
        f'Interactive session (exit_code={run_exit_code})',
        status_from_exit_code(run_exit_code),
        run_on_host=False,
    )


async def _run_in_docker(  # noqa: C901
    deployment,
    project_path,
    task_id,
    task,
    model,
    agent_path,
    _save_path: str,
    timeout_ms: int,
    *,
    task_file_path: str | None = None,
    interactive: bool = False,
    enable_skill: bool = False,
    enable_subagent: bool = False,
) -> dict:
    """Run task inside a Docker container.

    _save_path: Unused in Docker path (results are returned to main.py which writes reports).
    Kept for a consistent run_eval() → _run_in_docker() API.
    """
    if not SWEREX_AVAILABLE:
        raise RuntimeError('swerex is not available.')

    _validate_agent_path(agent_path)
    await deployment.start()
    runtime = deployment.runtime

    timeout_s = timeout_ms / 1000.0
    # swe-rex doesn't expose a public API for session-level timeout;
    # override the internal config as a workaround.
    if hasattr(runtime, '_config'):
        runtime._config.timeout = timeout_s

    await runtime.create_session(CreateBashSessionRequest())

    print('Uploading project files...', flush=True)
    await runtime.upload(UploadRequest(source_path=project_path, target_path='/repo'))
    await _run_bash(runtime, 'cd /repo')

    print('Uploading agent scripts...', flush=True)
    await runtime.upload(UploadRequest(source_path=agent_path, target_path='/agent'))
    await _run_bash(
        runtime,
        'chmod +x /agent/runner.sh /agent/install.sh 2>/dev/null; /agent/install.sh',
        120.0,  # install.sh may run pip install; allow up to 2 minutes
    )

    await _upload_task(runtime, task, task_file_path)
    await _setup_container_env(
        runtime, timeout_ms, enable_skill=enable_skill, enable_subagent=enable_subagent
    )

    container_id = await _get_container_id(runtime)
    result = None

    try:
        # Prefer foreground interactive when container_id is available and stdin is a TTY.
        if interactive and container_id and _stdin_is_tty():
            result = await _run_interactive_in_container(
                container_id, task_id, task, project_path, model, timeout_ms,
                enable_skill=enable_skill, enable_subagent=enable_subagent,
            )
        else:
            if interactive and not _stdin_is_tty():
                print(
                    'WARNING: Interactive mode requires a terminal (TTY). Running task in non-interactive mode.',
                    flush=True,
                )
            elif interactive and not container_id:
                print(
                    'WARNING: Cannot get container ID; falling back to non-interactive mode.',
                    flush=True,
                )
            # Background run: start runner, poll logs, then return result.
            run_results = await _monitor_runner(runtime, model, timeout_s)
            print(f'Runner result: {run_results}', flush=True)
            result = _make_eval_result(
                task_id,
                task,
                project_path,
                run_results.output,
                status_from_exit_code(run_results.exit_code),
                run_on_host=False,
            )
    except Exception as e:
        logging.error('Task %s error: %s', task_id, e, exc_info=True)
        result = _make_eval_result(
            task_id,
            task,
            project_path,
            str(e),
            'error',
            run_on_host=False,
        )
    finally:
        # Cleanup runs for success, error, and timeout paths alike.
        if not container_id:
            container_id = await _get_container_id(runtime)

        saved_image, stopped = None, False
        if container_id:
            try:
                saved_image, stopped = _save_container(container_id, project_path, task_id)
            except (OSError, subprocess.SubprocessError) as e:
                logging.warning('Save container failed: %s', e)

        try:
            await deployment.stop()
        except Exception as e:
            # Container may already be stopped; deployment.close() can fail with
            # ClientConnectorError when the remote service port is gone.
            logging.warning('deployment.stop() failed for task %s: %s', task_id, e)

        if result is None:
            # Exception occurred before any result was set (e.g. before try body ran
            # or a BaseException was raised). Ensure we always have a dict for update/return.
            result = _make_eval_result(
                task_id,
                task,
                project_path,
                'Execution interrupted or failed before result was set.',
                'error',
                run_on_host=False,
            )
        result.update(
            container_id=container_id,
            saved_image=saved_image,
            container_stopped=stopped,
        )

    return result
def run_eval(
    env: str,
    project_path: str,
    task_id: str,
    task: str,
    model: str,
    agent_path: str,
    save_path: str,
    docker_image: str | None = None,
    timeout_ms: int | None = None,
    *,
    skip_prereq_check: bool = False,
    use_gpu: bool = False,
    task_file_path: str | None = None,
    interactive: bool = False,
    enable_skill: bool = False,
    enable_subagent: bool = False,
) -> dict:
    """Run task in the given environment (local host or Docker).

    Single entry point — called from main.py for each JSONL task.
    """
    timeout_ms = resolve_timeout_ms(timeout_ms)

    # Host path: set the timeout env here (the Docker path configures the
    # container environment instead).
    if is_local_env(env):
        apply_timeout_env(timeout_ms)  # Docker mode uses container env only; no host env.
        print(f'Task {task_id}: HOST (timeout={timeout_ms}ms, interactive={interactive})')
        host_coro = _run_local(
            project_path,
            task_id,
            task,
            model,
            timeout_ms,
            skip_prereq_check=skip_prereq_check,
            interactive=interactive,
            enable_skill=enable_skill,
            enable_subagent=enable_subagent,
        )
        return asyncio.run(host_coro)

    if not SWEREX_AVAILABLE:
        raise RuntimeError('SWE-ReX not available. Install swe-rex for Docker mode.')

    # Docker path: build the deployment, then hand off to the async runner.
    image = docker_image or DEFAULT_DOCKER_IMAGE
    docker_args = [
        '--privileged',
        '--cgroupns=host',
        '-e',
        'KIND_EXPERIMENTAL_CONTAINERD_SNAPSHOTTER=native',
    ]
    if use_gpu:
        docker_args += ['--gpus', 'all']

    deployment = DockerDeploymentConfig(
        image=image,
        startup_timeout=1200.0,
        docker_args=docker_args,
    ).get_deployment()

    gpu_note = ' (GPU)' if use_gpu else ''
    print(f'Task {task_id}: DOCKER (image={image}, timeout={timeout_ms}ms){gpu_note}')
    return asyncio.run(
        _run_in_docker(
            deployment,
            project_path,
            task_id,
            task,
            model,
            agent_path,
            save_path,
            timeout_ms,
            task_file_path=task_file_path,
            interactive=interactive,
            enable_skill=enable_skill,
            enable_subagent=enable_subagent,
        )
    )
# Import utils: as package module or standalone in Docker.
try:
    from .utils import (
        DEFAULT_MODEL,
        DEFAULT_TIMEOUT_MS,
        has_api_key,
        is_local_env,
        resolve_timeout_ms,
    )
except (ImportError, SystemError):
    sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
    try:
        from utils import (
            DEFAULT_MODEL,
            DEFAULT_TIMEOUT_MS,
            has_api_key,
            is_local_env,
            resolve_timeout_ms,
        )
    except ImportError:
        # Fallback when utils is not importable (e.g. container has only runner.py).
        # Duplication intentional; single source is utils.py. Update both if default changes.
        DEFAULT_TIMEOUT_MS = 345_600_000  # 96h
        DEFAULT_MODEL = 'claude-sonnet-4-5-20250929'

        def is_local_env(env: str) -> bool:  # noqa: D103
            return str(env).strip().lower() == 'local'

        def has_api_key() -> bool:  # noqa: D103
            return bool(os.environ.get('ANTHROPIC_API_KEY') or os.environ.get('ANTHROPIC_FOUNDRY_API_KEY'))

        def resolve_timeout_ms(timeout_ms: int | None) -> int:  # noqa: D103
            return timeout_ms if timeout_ms is not None else DEFAULT_TIMEOUT_MS


try:
    from claude_agent_sdk import ClaudeAgentOptions, query

    CLAUDE_SDK_AVAILABLE = True
except ImportError:
    CLAUDE_SDK_AVAILABLE = False

try:
    from claude_agent_sdk import ClaudeSDKClient
except ImportError:
    ClaudeSDKClient = None

# Rate-limit retry tuning and progress cadence.
_RATE_LIMIT_MAX_RETRIES = 5
_RATE_LIMIT_WAIT_SEC = 60
_RATE_LIMIT_WAIT_MAX_SEC = 600
_RATE_LIMIT_WRAPPED_MAX_RETRIES = 3
_PROGRESS_LOG_INTERVAL = 10


# SDK message types whose string form carries the final answer.
_RESULT_TYPE_NAMES = frozenset({'ResultMessage', 'TextBlock'})


def _process_message(message, message_count: int, result_text: str) -> tuple[int, str]:
    """Process one SDK message: print, update count, extract result text.

    Returns (new_message_count, new_result_text).
    """
    new_count = message_count + 1
    if new_count % _PROGRESS_LOG_INTERVAL == 0:
        print(f'[Progress] {new_count} messages...', flush=True)
    rendered = str(message)
    print(rendered, flush=True)
    # Remember the latest result-bearing message; otherwise keep the old text.
    if type(message).__name__ in _RESULT_TYPE_NAMES:
        return new_count, rendered
    return new_count, result_text


def _is_rate_limit_error(exc: BaseException) -> bool:
    """True when the exception text looks like an HTTP 429 / rate-limit error."""
    text = str(exc).lower()
    return any(marker in text for marker in ('429', 'rate limit', 'ratelimitreached'))


def _is_possible_wrapped_rate_limit(exc: BaseException) -> bool:
    """True when a generic CLI failure might be hiding a rate-limit error."""
    text = str(exc).lower()
    if 'check stderr' in text:
        return True
    return 'command failed' in text and 'exit code 1' in text


def _parse_retry_after(exc: BaseException) -> int | None:
    """Extract 'wait N seconds' from an error message, if present."""
    found = re.search(r'wait\s+(\d+)\s*seconds', str(exc), re.I)
    return int(found.group(1)) if found else None


def _parse_env_bool(env_var: str, default: bool = False) -> bool:
    """Parse env var as bool. '1', 'true', 'yes' -> True."""
    raw = os.environ.get(env_var, '').strip().lower()
    if not raw:
        return default
    return raw in ('1', 'true', 'yes')


# Shared prompt fragments
_PROMPT_TIMEOUT_HOST = (
    'TIMEOUT CONFIGURATION (CRITICAL):\n'
    '- Long-running commands (builds, tests, Kind cluster creation) are expected\n'
    '- DO NOT set short timeouts - let commands complete naturally\n\n'
)
_PROMPT_TIMEOUT_DOCKER = (
    'TIMEOUT CONFIGURATION (CRITICAL):\n'
    '- The system has been configured with a Bash timeout of {timeout_ms} ms.\n'
    '- DO NOT specify timeout parameters in your Bash commands.\n'
    '- Long-running commands can take hours - this is normal.\n'
    '- If a command seems to be running long, DO NOT cancel or re-run it.\n\n'
)
_PROMPT_VERIFY_STEPS = (
    'You MUST execute every verification step the README requires. '
    'Do NOT skip steps because they take a long time.\n'
)


def build_system_prompt(
    task: str,
    *,
    env: str = 'docker',
    artifact_path: str | None = None,
    timeout_ms: int | None = None,
) -> str:
    """Build system prompt, parameterized by execution environment.

    Args:
        task: Task description text.
        env: 'local' for host execution, anything else for Docker.
        artifact_path: Path to artifact directory (used in host mode prompt).
        timeout_ms: Bash timeout in ms (shown in Docker mode prompt).
    """
    timeout_ms = resolve_timeout_ms(timeout_ms)
    if is_local_env(env):
        return _host_prompt(task, artifact_path or '.')
    return _docker_prompt(task, artifact_path, timeout_ms)


def _host_prompt(task: str, path: str) -> str:
    """Prompt for running directly on the host machine."""
    return (
        'You are an experienced software engineer completing an artifact task.\n\n'
        'ENVIRONMENT SETUP (HOST MACHINE):\n'
        '- You are running DIRECTLY on the host machine (NOT inside a Docker container)\n'
        '- Docker daemon is already running on this host\n'
        '- You may need sudo for some operations\n\n'
        f'ARTIFACT LOCATION:\n'
        f'- The artifact repository is located at: {path}\n'
        f'- Start by changing to this directory: cd {path}\n\n'
        f'YOUR TASK:\n{task}\n\n'
        + _PROMPT_TIMEOUT_HOST
        + 'IMPORTANT GUIDELINES:\n'
        f'1. First, cd to {path} and examine the directory structure\n'
        '2. Follow the README instructions step by step\n'
        f'3. {_PROMPT_VERIFY_STEPS}'
        "4. If you see 'sudo' in instructions, you can use it (or skip if already root)\n"
        '5. Use the Bash tool to run commands, Read tool to inspect files\n'
        '6. Work systematically through setup, build, and experiment execution\n'
        '7. If you encounter errors, debug and resolve them using available tools\n'
        "8. For Kind clusters, they will work properly since you're on the host (not DinD)"
    )


def _docker_prompt(task: str, artifact_path: str | None, timeout_ms: int) -> str:
    """Prompt for running inside a Docker container."""
    # Under arteval_bench the artifact is uploaded to /repo; standalone runs
    # may not know the path, so fall back to "explore and find it".
    if artifact_path:
        path_hint = f'- The artifact repository is at: {artifact_path}. Change to it: cd {artifact_path}\n'
    else:
        path_hint = (
            '- The artifact repository should be in the current working directory or nearby.\n'
            '- Explore the directory structure to find the artifact repository.\n'
        )
    return (
        'You are an experienced software engineer.\n\n'
        'ENVIRONMENT SETUP:\n'
        '- You are running inside a Docker container with root permissions.\n'
        f'{path_hint}'
        '- You have access to Read, Write, and Bash tools.\n\n'
        f'YOUR TASK:\n{task}\n\n'
        + _PROMPT_TIMEOUT_DOCKER.format(timeout_ms=timeout_ms)
        + 'IMPORTANT GUIDELINES:\n'
        '1. First, explore the current directory structure\n'
        '2. Navigate to the artifact repository root directory\n'
        "3. If you see 'sudo' in instructions, remove it (you already have root access)\n"
        '4. Do NOT attempt to switch git branches\n'
        '5. Follow the README instructions step by step\n'
        f'6. {_PROMPT_VERIFY_STEPS}'
        '7. Use the Bash, Read, and Write tools to complete the task\n'
        '8. Work systematically through setup, build, and experiment execution\n'
        '9. If you encounter errors, debug and resolve them'
    )
+ enable_skill: If True, enable Claude Agent SDK Skill (load from ~/.claude/skills/). + enable_subagent: If True, enable Claude Agent SDK Sub-agent (Task tool). + + Returns: + dict with keys: exit_code (int), output (str), message_count (int) + """ + if not CLAUDE_SDK_AVAILABLE: + raise RuntimeError('claude_agent_sdk is not available. Install with: pip install claude-agent-sdk') + + timeout_ms = resolve_timeout_ms(timeout_ms) + if system_prompt is None: + system_prompt = build_system_prompt(task, env=env, artifact_path=artifact_path, timeout_ms=timeout_ms) + + allowed_tools = ['Read', 'Write', 'Bash'] + if enable_skill: + allowed_tools.append('Skill') + if enable_subagent: + allowed_tools.append('Task') + setting_sources = ['user', 'project'] if enable_skill else ['user'] + + options = ClaudeAgentOptions( + model=model_name, + system_prompt=system_prompt, + allowed_tools=allowed_tools, + setting_sources=setting_sources, + ) + + initial_prompt = ( + f'Please start the artifact task. Begin by changing to the artifact ' + f'directory at {artifact_path} and examining its contents.' + if artifact_path + else 'Please start working on the artifact task. Begin by examining ' + 'the current directory and finding the artifact repository.' 
+ ) + + print(f'\n{"=" * 60}', flush=True) + print(f'Starting Claude Agent SDK with model: {model_name}', flush=True) + print(f'{"=" * 60}\n', flush=True) + + message_count = 0 + result_text = '' + + if interactive: + if ClaudeSDKClient is None: + raise RuntimeError('ClaudeSDKClient not available; cannot run interactive mode.') + async with ClaudeSDKClient(options=options) as client: + await client.query(initial_prompt) + async for message in client.receive_response(): + message_count, result_text = _process_message(message, message_count, result_text) + + print(f'\nInitial task done ({message_count} messages).', flush=True) + print('\n' + '=' * 60, flush=True) + print( + "Interactive mode — type instructions (or 'quit'/'exit' to end).", + flush=True, + ) + print('=' * 60 + '\n', flush=True) + + while True: + try: + user_input = input('\n>>> ').strip() + except (EOFError, KeyboardInterrupt): + print('\nExiting interactive mode.', flush=True) + break + if not user_input: + continue + if user_input.lower() in ('quit', 'exit', 'q'): + print('Exiting interactive mode.', flush=True) + break + await client.query(user_input) + async for msg in client.receive_response(): + message_count, result_text = _process_message(msg, message_count, result_text) + + return { + 'exit_code': 0 if message_count > 0 else 1, + 'output': result_text, + 'message_count': message_count, + } + + # Non-interactive with rate-limit retry + last_exception = None + for attempt in range(1, _RATE_LIMIT_MAX_RETRIES + 1): + try: + result_text = '' + message_count = 0 + async for message in query(prompt=initial_prompt, options=options): + message_count, result_text = _process_message(message, message_count, result_text) + + print(f'Completed. 
Total messages: {message_count}', flush=True) + return { + 'exit_code': 0, + 'output': result_text, + 'message_count': message_count, + } + + except asyncio.TimeoutError as e: + logger.error('Timed out: %s', e) + return { + 'exit_code': 1, + 'output': f'Timeout: {e}', + 'message_count': message_count, + } + except Exception as e: + last_exception = e + explicit = _is_rate_limit_error(e) + wrapped = _is_possible_wrapped_rate_limit(e) and not explicit + max_r = _RATE_LIMIT_MAX_RETRIES if explicit else _RATE_LIMIT_WRAPPED_MAX_RETRIES + if (explicit or wrapped) and attempt < max_r: + parsed = _parse_retry_after(e) + wait = ( + min(parsed, _RATE_LIMIT_WAIT_MAX_SEC) + if parsed + else min( + _RATE_LIMIT_WAIT_SEC * (2 ** (attempt - 1)), + _RATE_LIMIT_WAIT_MAX_SEC, + ) + ) + logger.warning( + 'Rate limit. Waiting %ds (attempt %d/%d)...', + wait, + attempt, + max_r, + ) + await asyncio.sleep(wait) + continue + logger.error('%s', e, exc_info=True) + return { + 'exit_code': 1, + 'output': f'Error: {e}', + 'message_count': message_count, + } + + return { + 'exit_code': 1, + 'output': f'Failed after {_RATE_LIMIT_MAX_RETRIES} attempts: {last_exception}', + 'message_count': 0, + } + + +# --------------------------------------------------------------------------- +# Standalone entry point (Docker container via runner.sh) +# --------------------------------------------------------------------------- + + +def _ensure_api_key() -> None: + """Ensure at least one API key is set; exit with error otherwise.""" + if has_api_key(): + return + logger.error('API key not set. Set ANTHROPIC_API_KEY or ANTHROPIC_FOUNDRY_API_KEY.') + sys.exit(1) + + +_INTERACTIVE_SYSTEM_PROMPT = """\ +You are an experienced software engineer in an interactive session. + +ENVIRONMENT: +- You are inside a Docker container with root permissions. +- The artifact repository is at /repo. Change to it: cd /repo +- You have access to Read, Write, and Bash tools. 
+ +TIMEOUT: Long-running commands can take hours; do not set short timeouts. + +You will receive follow-up instructions from the user. Complete each one and respond. +If the user asks to stop or says 'quit'/'exit', acknowledge and they will end the session.""" + +# When running under arteval_bench, artifact is always at /repo +_ARTIFACT_PATH_IN_CONTAINER = '/repo' + + +def docker_main(): + """Standalone entry point for running inside a Docker container via runner.sh.""" + raw_args = sys.argv[1:] + interactive = '--interactive' in raw_args + args = [a for a in raw_args if a != '--interactive'] + + enable_skill = _parse_env_bool('AE_ENABLE_SKILL', False) + enable_subagent = _parse_env_bool('AE_ENABLE_SUBAGENT', False) + + # Mode 1 — interactive-only (no task): runner.py --interactive [model] + if interactive and len(args) <= 1: + model = args[0] if args else os.environ.get('AE_AGENT_MODEL', DEFAULT_MODEL) + _ensure_api_key() + result = asyncio.run( + run_agent( + model, + 'Please confirm you are in /repo and ready for follow-up instructions. 
Reply briefly.', + system_prompt=_INTERACTIVE_SYSTEM_PROMPT, + interactive=True, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + ) + sys.exit(result['exit_code']) + + # Mode 2 — task execution: runner.py [--interactive] + if len(args) != 2: + print( + 'Usage: python3 runner.py [--interactive]\n' + ' python3 runner.py --interactive [model]', + file=sys.stderr, + ) + sys.exit(1) + + model_name = args[0] + task_arg = args[1] + if os.path.isfile(task_arg): + with open(task_arg, encoding='utf-8') as f: + task = f.read() + else: + task = task_arg + + _ensure_api_key() + + try: + raw = os.environ.get('BASH_MAX_TIMEOUT_MS') + timeout_ms = int(raw) if raw else None + except ValueError: + timeout_ms = None + timeout_ms = resolve_timeout_ms(timeout_ms) + + # In container (arteval_bench): artifact is at /repo + artifact_path = _ARTIFACT_PATH_IN_CONTAINER if os.path.isdir(_ARTIFACT_PATH_IN_CONTAINER) else None + + try: + if interactive: + result = asyncio.run( + run_agent( + model_name, + task, + env='docker', + artifact_path=artifact_path, + timeout_ms=timeout_ms, + interactive=True, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + ) + else: + result = asyncio.run( + asyncio.wait_for( + run_agent( + model_name, + task, + env='docker', + artifact_path=artifact_path, + timeout_ms=timeout_ms, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ), + timeout=timeout_ms / 1000.0, + ) + ) + sys.exit(result['exit_code']) + except asyncio.TimeoutError: + logger.error('Agent exceeded timeout.') + sys.exit(1) + except Exception as e: + logger.error('%s', e, exc_info=True) + sys.exit(1) + + +if __name__ == '__main__': + docker_main() diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/runner.sh b/benchmarks/arteval_bench/src/agents/ae_agent/runner.sh new file mode 100644 index 00000000..adf9bc69 --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/runner.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Do not use set -e; some 
commands may return non-zero without indicating failure + +# Set the model and task as parameters (task can be text or path to file, e.g. /agent/current_task.txt) +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo "Example: $0 claude-sonnet-4-5-20250929 \"Install and run tests\"" + echo " $0 claude-sonnet-4-5-20250929 /agent/current_task.txt" + exit 1 +fi + +# Disable Python buffering for real-time log output +export PYTHONUNBUFFERED=1 + +# Claude Agent SDK Bash timeout: use env if set, else default 96h (must match Python utils.DEFAULT_TIMEOUT_MS = 345_600_000) +if [ -z "$BASH_MAX_TIMEOUT_MS" ]; then + export BASH_MAX_TIMEOUT_MS=345600000 +fi +if [ -z "$BASH_DEFAULT_TIMEOUT_MS" ]; then + export BASH_DEFAULT_TIMEOUT_MS="$BASH_MAX_TIMEOUT_MS" +fi + +# Invoke Python runner (-u for unbuffered output) +python3 -u /agent/runner.py "$1" "$2" diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/utils.py b/benchmarks/arteval_bench/src/agents/ae_agent/utils.py new file mode 100644 index 00000000..89497b2f --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/utils.py @@ -0,0 +1,451 @@ +"""Helper methods for running artifact tasks.""" + +from __future__ import annotations + +import json +import os +import re +import subprocess + +__all__ = [ + 'AGENT_SUMMARY_FALLBACK_MAX', + 'DEFAULT_DOCKER_IMAGE', + 'DEFAULT_MODEL', + 'DEFAULT_TIMEOUT_MS', + 'LOG_OUTPUT_TRUNCATE_BYTES', + 'SUMMARY_BASENAME_TEMPLATE', + 'SUMMARY_INSTRUCTION', + 'Tee', + 'apply_timeout_env', + 'clone_artifact_repo', + 'compute_and_write_summary', + 'parse_artifact_url', + 'docker_image_from_item', + 'env_from_item', + 'get_task', + 'gpu_from_item', + 'has_api_key', + 'interactive_from_item', + 'enable_skill_from_item', + 'enable_subagent_from_item', + 'is_local_env', + 'parse_eval_score', + 'read_task_from_file', + 'resolve_project_path', + 'resolve_timeout_ms', + 'safe_task_id', + 'status_from_exit_code', + 'timeout_env_dict', + 'timeout_ms_from_item', + 'write_task_report', +] + +# Default 
total timeout in milliseconds (96h); used by run_eval and runner. +# Single source: runner.py fallback and runner.sh (345600000) must match when utils is unavailable. +DEFAULT_TIMEOUT_MS = 345_600_000 + +# Default Docker image and model when not specified. +DEFAULT_DOCKER_IMAGE = 'bastoica/ae-agent-ubuntu24.04:latest' +DEFAULT_MODEL = 'claude-sonnet-4-5-20250929' + +# File naming templates for reports and summaries. +SUMMARY_BASENAME_TEMPLATE = 'ae_summary_{safe_id}.md' +SUMMARY_INSTRUCTION = ( + '\n\nAt the end, write a brief summary of what you did and the result to ' + '{basename} in the artifact root (so it can be included in the report).' +) +LOG_OUTPUT_TRUNCATE_BYTES = 50000 +AGENT_SUMMARY_FALLBACK_MAX = 8000 + + +def timeout_env_dict(timeout_ms: int) -> dict[str, str]: + """Return env vars dict for Bash timeout (single source for env and settings file).""" + return { + 'BASH_MAX_TIMEOUT_MS': str(timeout_ms), + 'BASH_DEFAULT_TIMEOUT_MS': str(timeout_ms), + } + + +def apply_timeout_env(timeout_ms: int) -> None: + """Set BASH_MAX_TIMEOUT_MS and BASH_DEFAULT_TIMEOUT_MS in os.environ.""" + os.environ.update(timeout_env_dict(timeout_ms)) + + +def resolve_timeout_ms(timeout_ms: int | None) -> int: + """Return timeout_ms if set, else DEFAULT_TIMEOUT_MS. Single place for default.""" + return timeout_ms if timeout_ms is not None else DEFAULT_TIMEOUT_MS + + +def has_api_key() -> bool: + """True if at least one of ANTHROPIC_API_KEY or ANTHROPIC_FOUNDRY_API_KEY is set.""" + return bool(os.environ.get('ANTHROPIC_API_KEY') or os.environ.get('ANTHROPIC_FOUNDRY_API_KEY')) + + +def status_from_exit_code(exit_code: int) -> str: + """Map process exit code to eval status string. Non-zero (incl. 
-1 for unknown) → 'error'.""" + return 'success' if exit_code == 0 else 'error' + + +def is_local_env(env: str) -> bool: + """True if env denotes local (host) execution rather than Docker.""" + return str(env).strip().lower() == 'local' + + +def _parse_bool_value(v, default: bool = False) -> bool: + """Parse a value (bool, str, or other) to bool. Strings 'true', '1', 'yes' → True.""" + if isinstance(v, bool): + return v + if isinstance(v, str): + return v.strip().lower() in ('true', '1', 'yes') + return bool(v) + + +# Default task template when artifact_readme is not specified. +_DEFAULT_TASK_TEMPLATE = ( + 'You are an experienced software engineer.' + ' You are asked to navigate to the {file_path} and follow step-by-step' + ' instructions to set up, install, compile, and reproduce the results in' + ' that code repository. You have root access inside a Docker image, which' + ' means you can directly proceed with executing the steps in the README' + ' without asking for approval or confirmation. Once you reached the end' + ' of the README you must exit the Docker image gracefully.' 
+) + + +def interactive_from_item(item: dict) -> bool: + """Whether to enable interactive mode (user can continue giving agent instructions after task completes).""" + return _parse_bool_value(item.get('interactive', False)) + + +def enable_skill_from_item(item: dict, default: bool = False) -> bool: + """Whether to enable Claude Agent SDK Skill (load from ~/.claude/skills/ and .claude/skills/).""" + return _parse_bool_value(item.get('enable_skill', default)) + + +def enable_subagent_from_item(item: dict, default: bool = False) -> bool: + """Whether to enable Claude Agent SDK Sub-agent (Task tool).""" + return _parse_bool_value(item.get('enable_subagent', default)) + + +def safe_task_id(task_id: str | None, fallback: str = 'unknown') -> str: + """Normalize task_id for use in filenames (no spaces, lowercase).""" + return (task_id or fallback).replace(' ', '_').lower() + + +def timeout_ms_from_item(item: dict) -> int | None: + """Parse timeout from task item. Returns ms (int) or None for default. + + Accepts either ``timeout_sec`` (seconds, preferred) or ``timeout_ms`` + (milliseconds). Falls back to the legacy ``timeout`` field, which is + treated as seconds if < 86_400 (24 hours), otherwise milliseconds. + """ + if 'timeout_sec' in item: + v = item['timeout_sec'] + if isinstance(v, (int, float)): + return int(v * 1000) + return None + if 'timeout_ms' in item: + v = item['timeout_ms'] + if isinstance(v, (int, float)): + return int(v) + return None + v = item.get('timeout', None) + if v is None: + return None + if isinstance(v, (int, float)): + # Legacy heuristic: 86400 = 24h in seconds; values below are treated as + # seconds, else as milliseconds (e.g. 345600000 = 96h). + return int(v * 1000) if v < 86_400 else int(v) + return None + + +def env_from_item(item: dict) -> str: + """Resolve env from task item: 'local' = host, else = docker. 
Backward compat: run_on_host/docker_env.""" + env = item.get('env', None) + if env is not None: + s = str(env).strip().lower() + return 'local' if s == 'local' else (str(env).strip() or 'docker') + return 'local' if item.get('run_on_host', False) else 'docker' + + +def gpu_from_item(item: dict) -> bool: + """Whether to enable GPU access in Docker. Default False (no host GPU passed to container).""" + return _parse_bool_value(item.get('gpu', False)) + + +def docker_image_from_item( + item: dict, + default: str | None = None, + *, + env: str | None = None, +) -> str | None: + """Resolve Docker image from task item. + + When env is 'local', returns None (no Docker). Otherwise returns, in order: + item['env'] if it looks like an image name, item['docker_env'], or default. + If env is provided (e.g. from env_from_item), avoids parsing env twice. + """ + resolved = (env if env is not None else env_from_item(item)).strip().lower() + if resolved == 'local': + return None + env_val = item.get('env', None) + if env_val is not None: + s = str(env_val).strip() + if s and s.lower() != 'local': + return s + return ( + item.get('docker_env', None) + or item.get('docer_env', None) + or (default or DEFAULT_DOCKER_IMAGE) + ) + + +def get_task(file_path: str) -> str: + """Get agent task from a file path. + + Args: + file_path: Path to README or task description file (relative to artifact root) + + Returns: + Task description string for the agent + """ + return _DEFAULT_TASK_TEMPLATE.format(file_path=file_path) + + +def read_task_from_file(artifact_path: str, task_file: str) -> str: + """Read task description from a file. 
+ + Args: + artifact_path: Path to artifact root directory + task_file: Relative path to task file (e.g., README.md) + + Returns: + Content of the task file as string + """ + task_file_path = os.path.join(artifact_path, task_file) + if os.path.exists(task_file_path): + with open(task_file_path, encoding='utf-8') as f: + return f.read() + else: + return get_task(task_file) + + +def parse_artifact_url(artifact_url: str) -> tuple[str, str | None]: + """Parse artifact URL into (clone_url, branch) for git clone. + + Supports GitHub-style URLs: + - https://github.com/org/repo -> (https://github.com/org/repo.git, None) + - https://github.com/org/repo/tree/branch -> (https://github.com/org/repo.git, branch) + """ + url = (artifact_url or '').strip() + if not url: + return url, None + # .../tree/ or .../tree// + tree_match = re.search(r'^(.*?)/tree/([^/#]+?)/?$', url) + if tree_match: + base, branch = tree_match.group(1), tree_match.group(2).strip() + if not base.endswith('.git'): + base = base.rstrip('/') + '.git' + return base, branch if branch else None + if not url.endswith('.git'): + url = url.rstrip('/') + '.git' + return url, None + + +def clone_artifact_repo(artifact_url: str, target_dir: str, branch: str | None = None) -> str: + """Clone artifact repository from URL into target_dir. + + Args: + artifact_url: Git clone URL (e.g. https://github.com/org/repo or .../repo/tree/branch). + target_dir: Absolute path to the directory to clone into (must not exist or be empty). + branch: Optional branch to clone. If None, parse_artifact_url(artifact_url) is used. + + Returns: + target_dir (artifact root path after clone). + + Raises: + RuntimeError: If git clone fails. 
+ """ + if os.path.exists(target_dir) and os.listdir(target_dir): + return target_dir + if os.path.exists(target_dir): + os.rmdir(target_dir) + clone_url, parsed_branch = parse_artifact_url(artifact_url) + use_branch = branch if branch is not None else parsed_branch + cmd = ['git', 'clone', '--depth', '1'] + if use_branch: + cmd.extend(['-b', use_branch]) + cmd.extend([clone_url, target_dir]) + r = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=600, + ) + if r.returncode != 0: + raise RuntimeError(f'git clone failed: {r.stderr or r.stdout}') + return target_dir + + +def resolve_project_path(item: dict, input_file: str, save_path: str) -> tuple[str | None, str | None]: + """Resolve artifact project path from task item. + + When both artifact_url and artifact_dir are set, if the local path + (input_dir/artifact_dir) already exists, it is used and no clone is performed. + Otherwise the repo is cloned from artifact_url into save_path/workspace/. + + Returns: + (project_path, error_message). If error_message is not None, skip task. + """ + input_dir = os.path.dirname(os.path.abspath(input_file)) + artifact_dir = item.get('artifact_dir') + artifact_url = item.get('artifact_url') + task_id = item.get('artifact_id') + sid = safe_task_id(task_id) + + if artifact_url: + candidate = os.path.join(input_dir, artifact_dir) if artifact_dir else None + if candidate and os.path.isdir(candidate): + return os.path.abspath(candidate), None + workspace_dir = os.path.join(save_path, 'workspace', sid) + os.makedirs(os.path.dirname(workspace_dir), exist_ok=True) + return clone_artifact_repo(artifact_url, workspace_dir), None + if not artifact_dir: + return None, f'Skipping task {task_id}: missing artifact_dir and artifact_url' + path = os.path.abspath(os.path.join(input_dir, artifact_dir)) + if not os.path.isdir(path): + return None, f'Project path does not exist: {path}' + return path, None + + +class Tee: + """Write to both an original stream and a log file. 
+ + Implements enough of the TextIO interface to serve as a drop-in + replacement for sys.stdout / sys.stderr (supports libraries that + probe encoding, isatty, etc.). + """ + + def __init__(self, stream, log_path: str): + """Wrap stream and log_path for dual write.""" + self._stream = stream + self._path = log_path + self._file = None + + def __enter__(self): + """Open log file and return self.""" + self._file = open(self._path, 'a', encoding='utf-8') + return self + + def __exit__(self, *args): + """Close log file.""" + if self._file: + self._file.close() + + def write(self, data): + """Write to both stream and log file.""" + self._stream.write(data) + if self._file: + self._file.write(data) + self._file.flush() + + def flush(self): + """Flush both stream and log file.""" + self._stream.flush() + if self._file: + self._file.flush() + + @property + def encoding(self) -> str: + """Return underlying stream encoding or utf-8.""" + return getattr(self._stream, 'encoding', 'utf-8') + + def isatty(self) -> bool: + """Return whether underlying stream is a TTY.""" + return getattr(self._stream, 'isatty', lambda: False)() + + def fileno(self) -> int: + """Return underlying stream fileno.""" + return self._stream.fileno() + + +def write_task_report( + save_path: str, + safe_id: str, + task_id: str, + result: dict, + log_path: str, + agent_summary: str, +) -> None: + """Write ae_report_.md for a single task.""" + report_path = os.path.join(save_path, f'ae_report_{safe_id}.md') + saved_image = result.get('saved_image') + with open(report_path, 'w', encoding='utf-8') as fw: + fw.write(f'# AE Report: {task_id}\n\n') + fw.write(f'- **Status**: {result.get("status", "unknown")}\n') + fw.write(f'- **Timestamp**: {result.get("timestamp", "")}\n') + fw.write(f'- **Project path**: {result.get("project_path", "")}\n') + fw.write(f'- **Run on host**: {result.get("run_on_host", False)}\n') + fw.write(f'- **Log file**: `{log_path}`\n\n') + if saved_image: + fw.write('> [!Note]\n') + 
fw.write('> ## To check the result\n') + fw.write('>\n') + fw.write('> You can run the following command to manually check the result:\n') + fw.write('>\n') + fw.write('> ```bash\n') + fw.write(f'> docker run -it {saved_image} bash\n') + fw.write('> ```\n') + fw.write('>\n') + fw.write(f'> Image: `{saved_image}`\n\n') + fw.write('## Agent summary\n\n') + fw.write(agent_summary) + fw.write('\n') + + +def parse_eval_score(output) -> int: + """Parse evaluation score from evaluator script output (string or object with .output). + + - If a line is a single digit (e.g. '4', '0'), use it (prefer last such line). + - If output contains 'Agent scores: {...}' (Oracle-style evaluator), count ': 1' as passed items. + - Otherwise return 0. + """ + s = (getattr(output, 'output', None) or str(output) or '').strip() + if not s: + return 0 + lines = s.splitlines() + for line in reversed(lines): + t = line.strip() + if t.isdigit(): + return int(t) + m = re.search(r'Agent scores:\s*\{[^}]*\}', s) + if m: + return m.group(0).count(': 1') + return 0 + + +def compute_and_write_summary(save_path: str) -> tuple[int, int]: + """Read result.jsonl, compute total/success, write summary.json. + + total = number of result lines (success + error + skipped). success = status == "success". + Returns (total_count, success_count). 
+ """ + result_path = os.path.join(save_path, 'result.jsonl') + total, success = 0, 0 + if os.path.isfile(result_path): + with open(result_path, encoding='utf-8') as f: + for line in f: + if not line.strip(): + continue + try: + row = json.loads(line.strip()) + total += 1 + if row.get('status') == 'success': + success += 1 + except json.JSONDecodeError: + continue + rate = success / total if total > 0 else 0.0 + summary = {'total_tasks': total, 'successful_tasks': success, 'success_rate': rate} + with open(os.path.join(save_path, 'summary.json'), 'w', encoding='utf-8') as f: + json.dump(summary, f, indent=4) + return total, success diff --git a/benchmarks/arteval_bench/src/main.py b/benchmarks/arteval_bench/src/main.py new file mode 100644 index 00000000..75222211 --- /dev/null +++ b/benchmarks/arteval_bench/src/main.py @@ -0,0 +1,246 @@ +"""This script runs a benchmark for evaluating patches in a software project.""" + +import argparse +import json +import os +import sys +from datetime import datetime + +_src_dir = os.path.dirname(os.path.abspath(__file__)) +if _src_dir not in sys.path: + sys.path.insert(0, _src_dir) +sys.path.append(os.path.abspath(os.path.join(_src_dir, '../../../'))) + +from sdk.logger import logger +from sdk.utils import set_llm_endpoint_from_config + +set_llm_endpoint_from_config('env.toml') + +from run_eval_in_env import run_eval +from utils import get_task + +from agents.ae_agent.utils import ( + enable_skill_from_item, + enable_subagent_from_item, + gpu_from_item, + interactive_from_item, + resolve_project_path, + safe_task_id, + timeout_ms_from_item, + write_task_report, + compute_and_write_summary, +) + + +def _persist_skipped(save_path: str, task_id: str, message: str, expected_score: int = -1) -> None: + """Append one result line for a skipped task so summary total is accurate (same as ae-agent).""" + result = { + 'task_id': task_id, + 'status': 'skipped', + 'message': message, + 'expected_score': expected_score, + } + with 
open(os.path.join(save_path, 'result.jsonl'), 'a+', encoding='utf-8') as fw: + fw.write(json.dumps(result, ensure_ascii=False) + '\n') + + +def _parse_bool(v, default=False): + if isinstance(v, bool): + return v + if isinstance(v, str): + return v.strip().lower() in ('true', '1', 'yes') + return bool(v) if v is not None else default + + +def _is_ae_agent(agent): + """True if agent path points to the ae_agent (for report/summary writing).""" + if not agent: + return False + return 'ae_agent' in agent or os.path.basename(agent) == 'ae_agent' + + +def main(file_path, model, agent, save_path, interactive_default=False, enable_skill_default=False, enable_subagent_default=False): + """Main function for running the benchmark.""" + logger.info(f'Using model: {model}, agent: {agent}') + with open(file_path) as f: + for line in f: + if not line.strip(): + continue # Skip empty lines + + try: + item = json.loads(line) + except json.JSONDecodeError: + logger.info(f'Skipping invalid JSON line: {line}') + continue + + env_val = item.get('env', None) + if env_val is not None: + s = str(env_val).strip().lower() + if s == 'local': + run_on_host = True + deployment = None + else: + run_on_host = False + deployment = str(env_val).strip() or None + else: + deployment = item.get('docker_env', None) or item.get('docer_env', None) + run_on_host = item.get('run_on_host', False) + task_id = item.get('artifact_id', None) + project_path, path_error = resolve_project_path(item, file_path, save_path) + if path_error: + logger.info(f"Task {task_id}: {path_error}") + _persist_skipped( + save_path, + task_id or safe_task_id(task_id), + path_error, + item.get('expected_score', -1), + ) + continue + task_file = item.get('artifact_readme', None) + test_method = item.get('evaluator', None) + + timeout_ms = timeout_ms_from_item(item) + gpu = gpu_from_item(item) + interactive = interactive_from_item(item) or interactive_default + enable_skill = enable_skill_from_item(item, enable_skill_default) + 
enable_subagent = enable_subagent_from_item(item, enable_subagent_default) + keep_container = _parse_bool(item.get('keep_container'), False) + + task = get_task(task_file) + + logger.info( + f"Task {task_id}: project_path={project_path}, run_on_host={run_on_host}, " + f"timeout_ms={timeout_ms}, gpu={gpu}, interactive={interactive}, " + f"enable_skill={enable_skill}, enable_subagent={enable_subagent}, keep_container={keep_container}" + ) + + result = run_eval( + deployment=deployment, + project_path=project_path, + task_id=task_id, + task=task, + model=model, + agent_path=agent, + test_method=test_method, + save_path=save_path, + run_on_host=run_on_host, + timeout_ms=timeout_ms, + gpu=gpu, + interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + keep_container=keep_container, + ) + + result['expected_score'] = item.get('expected_score', -1) + result['timestamp'] = result.get('timestamp') or datetime.now().isoformat() + with open(f'{save_path}/result.jsonl', 'a+', encoding='utf-8') as fw: + fw.write(json.dumps(result, ensure_ascii=False) + '\n') + + # When using ae_agent, also write per-task AE report (same as standalone ae-agent). + if _is_ae_agent(agent): + safe_id = safe_task_id(task_id) + log_path = result.get('log_file') or '(log not captured when run via benchmark)' + agent_summary = (result.get('agent_run_results') or '')[:8000] or '(No summary captured)' + try: + write_task_report(save_path, safe_id, task_id, result, log_path, agent_summary) + except Exception as e: + logger.warning('write_task_report failed: %s', e) + + # Write summary.json (total/success counts) when ae_agent was used. 
+ if _is_ae_agent(agent): + try: + compute_and_write_summary(save_path) + except Exception as e: + logger.warning('compute_and_write_summary failed: %s', e) + + success_count = 0 + total_count = 0 + with open(f'{save_path}/result.jsonl', encoding='utf-8') as f: + for line in f: + result = json.loads(line.strip()) + if result.get('status') == 'success': + success_count += (result.get('score') == result.get('expected_score', -1)) + total_count += 1 + logger.info(f'Test run completed: {success_count}/{total_count} tasks succeeded.') + summary_data = {'final_score': success_count / total_count, 'total_tasks': total_count} + + with open(os.path.join(save_path, 'avg_score.json'), 'w', encoding='utf-8') as summary_file: + json.dump(summary_data, summary_file, indent=4) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='example benchmark') + parser.add_argument( + '-i', + '--input_file', + help='Benchmark input file', + default='./data/benchmark/arteval_tasks.jsonl', + #default='./data/benchmark/env_setup_examples.jsonl', + ) + parser.add_argument('-o', '--save_path', help='Result save path', default=None) + parser.add_argument( + '-a', + '--agent', + help='Agent Name', + default='claudecode', + ) + parser.add_argument( + '-m', + '--model_name', + help='Model Name', + default='claude-sonnet-4-5-20250929', + ) + parser.add_argument( + '--interactive', + action='store_true', + help='Enable interactive mode (continue giving agent instructions after task completes)', + ) + parser.add_argument( + '--enable-skill', + action='store_true', + help='Enable Claude Agent SDK Skill (load from ~/.claude/skills/)', + ) + parser.add_argument( + '--enable-subagent', + action='store_true', + help='Enable Claude Agent SDK Sub-agent (Task tool)', + ) + # Note that if your benchmark has multiple tasks, you need to add --task + # in your code to enable task selection. 
+ parser.add_argument('-t', '--task', help='specify task in scenarios', default=None) + + args = parser.parse_args() + + model_name = args.model_name + agent = args.agent + input_file = args.input_file + save_path = args.save_path + task = args.task + + logger.debug(f"Benchmark path: {input_file}") + + if save_path is None: + str_model_name = model_name.replace('/', '_') + timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + save_path = os.path.join('./outputs', f'env_setup_project__{str_model_name}__{args.agent}__{timestamp}') + + _src_dir = os.path.dirname(os.path.abspath(__file__)) + if agent == 'claudecode': + agent = os.path.join(_src_dir, 'agents', 'claudecode') + elif agent == 'claude_sdk': + agent = os.path.join(_src_dir, 'agents', 'claude_sdk') + elif agent == 'ae_agent' or agent == 'ae-agent': + agent = os.path.join(_src_dir, 'agents', 'ae_agent') + save_path = os.path.abspath(os.path.expanduser(save_path)) + os.makedirs(save_path, exist_ok=True) + + main( + input_file, + model_name, + agent, + save_path, + interactive_default=getattr(args, 'interactive', False), + enable_skill_default=getattr(args, 'enable_skill', False), + enable_subagent_default=getattr(args, 'enable_subagent', False), + ) diff --git a/benchmarks/arteval_bench/src/run_eval_in_env.py b/benchmarks/arteval_bench/src/run_eval_in_env.py new file mode 100644 index 00000000..afc5a22b --- /dev/null +++ b/benchmarks/arteval_bench/src/run_eval_in_env.py @@ -0,0 +1,845 @@ +"""Patch evaluator for running tests in a deployment.""" + +import asyncio +import json +import os +import re +import subprocess +import sys +import tempfile +import shutil +from pathlib import Path + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../'))) + +from swerex.deployment.docker import DockerDeploymentConfig +from swerex.runtime.abstract import BashAction, Command, CreateBashSessionRequest, UploadRequest + +from sdk.logger import logger + + +def _parse_eval_score(output) -> int: + 
"""Parse evaluation score from BashObservation or string output. + + - If a line is a single digit (e.g. '4', '0'), use it (prefer last such line). + - If output contains 'Agent scores: {...}' (Oracle-style evaluator), count ': 1' as passed items. + - Otherwise return 0. + """ + s = (getattr(output, "output", None) or str(output) or "").strip() + if not s: + return 0 + lines = s.splitlines() + for line in reversed(lines): + t = line.strip() + if t.isdigit(): + return int(t) + m = re.search(r"Agent scores:\s*\{[^}]*\}", s) + if m: + return m.group(0).count(": 1") + return 0 + + +def write_to_file(file_path, content): + """Write content to a file.""" + with open(file_path, 'w') as f: + f.write(content) + + +def setup_claude_settings_on_host(): + """Set up ~/.claude/settings.json with timeout configuration on host.""" + claude_dir = Path.home() / ".claude" + settings_file = claude_dir / "settings.json" + + claude_dir.mkdir(exist_ok=True) + + settings = { + "env": { + "BASH_MAX_TIMEOUT_MS": "345600000", # 96 hours + "BASH_DEFAULT_TIMEOUT_MS": "345600000" + } + } + + with open(settings_file, 'w') as f: + json.dump(settings, f, indent=2) + + logger.info(f"Created {settings_file} with 96-hour timeout configuration.") + + +def _is_ae_agent_path(agent_path) -> bool: + """True if agent_path points to the ae_agent agent (same flow: agent + evaluation script).""" + if not agent_path: + return False + p = (agent_path or "").rstrip(os.sep) + return p.endswith("ae_agent") or os.path.basename(p) == "ae_agent" + + +def _stdin_is_tty() -> bool: + """True if stdin is a TTY (required for docker exec -it).""" + return getattr(sys.stdin, "isatty", lambda: False)() + + +async def _get_container_id_from_runtime(runtime, deployment) -> str: + """Get Docker container ID from inside the container (hostname/cgroup) or from deployment.""" + container_id = "unknown" + try: + res = await runtime.run_in_session( + BashAction(command='cat /etc/hostname 2>/dev/null || hostname 2>/dev/null || echo 
"unknown"', timeout=10.0) + ) + container_id = str(getattr(res, "output", "")).strip() + try: + cgroup_res = await runtime.run_in_session( + BashAction(command='cat /proc/self/cgroup 2>/dev/null | grep docker | head -1 | cut -d/ -f3 | cut -c1-12 || echo ""', timeout=10.0) + ) + cid = str(getattr(cgroup_res, "output", "")).strip() + if cid: + container_id = cid + except Exception: + pass + if hasattr(deployment, '_container_id') and getattr(deployment, '_container_id', None): + container_id = deployment._container_id + elif hasattr(deployment, 'container_id') and getattr(deployment, 'container_id', None): + container_id = deployment.container_id + except Exception as e: + logger.warning('Failed to get container ID: %s', e) + return container_id + + +async def _run_ae_agent_interactive_foreground( + container_id: str, + model: str, + timeout_ms: int | None, + enable_skill: bool, + enable_subagent: bool, +): + """Run ae_agent runner in foreground via docker exec -it (interactive mode). Returns MockResult with exit_code.""" + try: + from agents.ae_agent.utils import resolve_timeout_ms + from agents.ae_agent.run_eval import _docker_exec_env_args + except ImportError: + _src = os.path.dirname(os.path.abspath(__file__)) + if _src not in sys.path: + sys.path.insert(0, _src) + from agents.ae_agent.utils import resolve_timeout_ms + from agents.ae_agent.run_eval import _docker_exec_env_args + + timeout_resolved = resolve_timeout_ms(timeout_ms) + exec_env = _docker_exec_env_args( + timeout_resolved, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + exec_args = ( + ['docker', 'exec', '-it'] + + exec_env + + [container_id, 'python3', '-u', '/agent/runner.py', model, '/agent/current_task.txt', '--interactive'] + ) + logger.info('Running ae_agent in interactive mode (foreground): docker exec -it %s ...', container_id[:12]) + proc = await asyncio.to_thread( + subprocess.run, + exec_args, + stdin=sys.stdin, + stdout=sys.stdout, + stderr=sys.stderr, + ) + 
exit_code = proc.returncode if proc else -1 + + class MockResult: + def __init__(self, code, output=''): + self.exit_code = code + self.output = output or f'exit_code={code}' + + return MockResult(exit_code, f'Interactive session (exit_code={exit_code})') + + +async def run_eval_on_host( + project_path, + task_id, + task, + model, + agent_path, + test_method, + save_path, + timeout_ms=None, + interactive=False, + enable_skill=False, + enable_subagent=False, +): + """Run evaluation directly on host machine (no Docker container). + + When agent is ae_agent, delegates to ae_agent.run_agent_then_eval (agent run + evaluation script), + same flow as claude_sdk. Otherwise uses inline Claude SDK + test_method. + """ + logger.info("=" * 80) + logger.info("Running evaluation directly on HOST MACHINE (not in Docker)") + logger.info("=" * 80) + + if _is_ae_agent_path(agent_path): + logger.info("Using ae_agent flow: run agent then evaluation script.") + try: + from agents.ae_agent.run_eval import _run_agent_then_eval_async + except ImportError: + _src = os.path.dirname(os.path.abspath(__file__)) + if _src not in sys.path: + sys.path.insert(0, _src) + from agents.ae_agent.run_eval import _run_agent_then_eval_async + result = await _run_agent_then_eval_async( + project_path=project_path, + task_id=task_id, + task=task, + model=model, + test_method=test_method, + save_path=save_path, + timeout_ms=timeout_ms, + skip_prereq_check=False, + interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + return result + + # Original flow: inline Claude SDK then test_method (e.g. 
claude_sdk or default) + import shutil + + if not shutil.which("docker"): + raise RuntimeError("Docker is not installed on host") + + result = subprocess.run(["docker", "ps"], capture_output=True, timeout=10) + if result.returncode != 0: + raise RuntimeError("Docker is not running on host") + + if not os.environ.get("ANTHROPIC_API_KEY"): + raise RuntimeError("ANTHROPIC_API_KEY environment variable is not set") + + setup_claude_settings_on_host() + + project_path = os.path.abspath(project_path) + if not os.path.isdir(project_path): + raise RuntimeError(f"Project path does not exist: {project_path}") + + logger.info(f"Project path: {project_path}") + logger.info(f"Task ID: {task_id}") + logger.info(f"Model: {model}") + + try: + from claude_agent_sdk import query, ClaudeAgentOptions + except ImportError as e: + raise RuntimeError(f"claude_agent_sdk not installed: {e}. Install with: pip install claude-agent-sdk") + + system_prompt = f"""You are an experienced software engineer completing an artifact evaluation task. + +ENVIRONMENT SETUP (HOST MACHINE - NOT DOCKER): +- You are running DIRECTLY on the host machine (NOT inside a Docker container) +- Docker daemon is already running on this host +- When you use Kind to create Kubernetes clusters, they will be created using the host's Docker +- This avoids Docker-in-Docker compatibility issues +- You may need sudo for some operations + +ARTIFACT LOCATION: +- The artifact repository is located at: {project_path} +- Start by changing to this directory: cd {project_path} + +YOUR TASK: +{task} + +TIMEOUT CONFIGURATION (CRITICAL): +- Long-running commands (builds, tests, Kind cluster creation) are expected +- DO NOT set short timeouts - let commands complete naturally +- Kind cluster creation can take 5-10 minutes +- Full benchmark runs can take hours + +IMPORTANT GUIDELINES: +1. First, cd to {project_path} and examine the directory structure +2. Follow the README instructions step by step +3. 
If you see 'sudo' in instructions, you can use it (or skip if already root) +4. Use the Bash tool to run commands, Read tool to inspect files +5. Work systematically through setup, build, and experiment execution +6. If you encounter errors, debug and resolve them using available tools +7. For Kind clusters, they will work properly since you're on the host (not DinD)""" + + options = ClaudeAgentOptions( + system_prompt=system_prompt, + allowed_tools=["Read", "Write", "Bash"], + setting_sources=["user"], + ) + + os.environ['BASH_MAX_TIMEOUT_MS'] = '345600000' + os.environ['BASH_DEFAULT_TIMEOUT_MS'] = '345600000' + + logger.info("Starting Claude Agent SDK (Host Mode)...") + + message_count = 0 + run_results_output = "" + + try: + async for message in query( + prompt=f"Please start the artifact evaluation task. Begin by changing to the artifact directory at {project_path} and examining its contents.", + options=options + ): + message_count += 1 + if message_count % 10 == 0: + logger.info(f"[Progress] Processed {message_count} messages...") + msg_str = str(message) + logger.info(msg_str) + if 'ResultMessage' in msg_str or 'TextBlock' in msg_str: + run_results_output = msg_str + logger.info(f"Claude Agent SDK execution completed. 
Total messages: {message_count}") + except Exception as e: + logger.error(f"Claude Agent SDK execution failed: {e}") + import traceback + traceback.print_exc() + run_results_output = f"Error: {e}" + + logger.info("Running evaluation script...") + try: + eval_cmd = f"cd {project_path} && {test_method}" + eval_result = subprocess.run( + eval_cmd, + shell=True, + capture_output=True, + text=True, + timeout=300 + ) + test_output = eval_result.stdout.strip() + logger.info(f"Evaluation output: {test_output}") + result = { + 'task_id': task_id, + 'task': task, + 'project_path': project_path, + 'agent_run_results': run_results_output, + 'test_method': test_method, + 'score': _parse_eval_score(test_output), + 'status': 'success', + 'run_on_host': True, + } + except Exception as e: + logger.error(f"Error running test method: {e}") + result = { + 'task_id': task_id, + 'task': task, + 'project_path': project_path, + 'agent_run_results': run_results_output, + 'test_method': test_method, + 'score': 0, + 'status': f'error: {str(e)}', + 'run_on_host': True, + } + + return result + + +async def run_eval_in_env( + deployment, + project_path, + task_id, + task, + model, + agent_path, + test_method, + save_path, + timeout_ms=None, + gpu=False, + interactive=False, + enable_skill=False, + enable_subagent=False, + keep_container=True, +): + """Spoiler: This function will work with any deployment.""" + await deployment.start() + runtime = deployment.runtime + + # Default 96h when timeout_ms not provided + runner_timeout_sec = (timeout_ms / 1000.0) if timeout_ms is not None else 345600.0 + if hasattr(runtime, "_config"): + logger.info(f"Current RemoteRuntime timeout: {runtime._config.timeout}s") + runtime._config.timeout = runner_timeout_sec + logger.info(f"Overriding RemoteRuntime timeout to {runtime._config.timeout}s") + + # Issue a few one-off commands, similar to `subprocess.run()` + logger.info(await runtime.execute(Command(command=['echo', 'Hello, world!']))) + + # Create a bash 
session + await runtime.create_session(CreateBashSessionRequest()) + # Run a command in the session + # The difference to the one-off commands is that environment state persists! + logger.info(await runtime.run_in_session(BashAction(command="export MYVAR='test'"))) + logger.info(await runtime.run_in_session(BashAction(command='echo $MYVAR'))) + + logger.info('Uploading project files...') + logger.info( + await runtime.upload( + UploadRequest( + source_path=project_path, + target_path='/repo', + ) + ) + ) + logger.info('Project files uploaded.') + + # Long-running agents (claude_sdk, ae_agent): remove eval script dirs so the agent cannot see evaluation logic + is_claude_sdk = str(agent_path).endswith('claude_sdk') + is_ae_agent = str(agent_path).endswith('ae_agent') + is_long_running_agent = is_claude_sdk or is_ae_agent + agent_label = 'ae_agent' if is_ae_agent else 'claude_sdk' + if is_long_running_agent: + logger.info(f'Removing _agent_eval directories for {agent_label} to prevent answer leakage...') + await runtime.run_in_session( + BashAction(command='find /repo -type d -name "_agent_eval" -exec rm -rf {} + 2>/dev/null || true', timeout=30.0) + ) + logger.info('_agent_eval directories removed.') + + run_results = await runtime.run_in_session(BashAction(command='cd /repo')) + logger.info(run_results) + run_results = await runtime.run_in_session(BashAction(command='pwd')) + logger.info(f'Current directory: {run_results}') + run_results = await runtime.run_in_session(BashAction(command='ls')) + logger.info(f'Current directory contents: {run_results}') + + logger.info('Uploading agent runner script...') + logger.info( + await runtime.upload( + UploadRequest( + source_path=agent_path, + target_path='/agent', + ) + ) + ) + logger.info(await runtime.run_in_session(BashAction(command='ls /agent/runner.sh'))) + logger.info('Agent runner script uploaded.') + + logger.info('Setup the agent running environment...') + logger.info(await 
runtime.run_in_session(BashAction(command='chmod +x /agent/runner.sh /agent/install.sh'))) + logger.info(await runtime.run_in_session(BashAction(command='cat /agent/runner.sh'))) + logger.info(await runtime.run_in_session(BashAction(command='/agent/install.sh'))) + + # Set required env vars for long-running agents (passed from host into container) + if is_long_running_agent: + parts = [] + anthropic_api_key = os.environ.get('ANTHROPIC_API_KEY') + foundry_api_key = os.environ.get('ANTHROPIC_FOUNDRY_API_KEY') + if anthropic_api_key: + escaped_key = anthropic_api_key.replace("'", "'\"'\"'") + parts.append(f"export ANTHROPIC_API_KEY='{escaped_key}'") + if foundry_api_key: + escaped_foundry = foundry_api_key.replace("'", "'\"'\"'") + parts.append(f"export ANTHROPIC_FOUNDRY_API_KEY='{escaped_foundry}'") + if not anthropic_api_key: + parts.append(f"export ANTHROPIC_API_KEY='{escaped_foundry}'") + foundry_base = os.environ.get('ANTHROPIC_FOUNDRY_BASE_URL') + if foundry_base: + escaped_url = foundry_base.replace("'", "'\"'\"'") + parts.append(f"export ANTHROPIC_FOUNDRY_BASE_URL='{escaped_url}'") + if os.environ.get('CLAUDE_CODE_USE_FOUNDRY') == '1': + parts.append("export CLAUDE_CODE_USE_FOUNDRY=1") + if enable_skill: + parts.append("export AE_ENABLE_SKILL=1") + if enable_subagent: + parts.append("export AE_ENABLE_SUBAGENT=1") + if parts: + set_env_cmd = " && ".join(parts) + logger.info('Setting Anthropic/Foundry API key and env in container...') + logger.info(await runtime.run_in_session(BashAction(command=set_env_cmd))) + if not anthropic_api_key and not foundry_api_key: + logger.warning('Neither ANTHROPIC_API_KEY nor ANTHROPIC_FOUNDRY_API_KEY found. 
Runner may fail.') + + # For ae_agent: upload task to /agent/current_task.txt to avoid shell quoting with large tasks + if is_ae_agent: + tmpdir = tempfile.mkdtemp(prefix='ae_agent_task_') + try: + task_file_host = os.path.join(tmpdir, 'current_task.txt') + with open(task_file_host, 'w', encoding='utf-8') as f: + f.write(task) + await runtime.upload(UploadRequest(source_path=tmpdir, target_path='/agent_task_file')) + await runtime.run_in_session(BashAction(command='cp /agent_task_file/current_task.txt /agent/current_task.txt', timeout=10.0)) + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + logger.info('Task file uploaded to /agent/current_task.txt for ae_agent.') + + logger.info('Running runner script...') + if timeout_ms is not None: + runner_timeout = timeout_ms / 1000.0 + else: + runner_timeout = 345600.0 if is_long_running_agent else 1200.0 # 96h for long-running agents + + run_results = None + # Docker + interactive: run ae_agent in foreground via docker exec -it (same as standalone ae-agent). 
+ if is_ae_agent and interactive and _stdin_is_tty(): + container_id_early = await _get_container_id_from_runtime(runtime, deployment) + if container_id_early and container_id_early != "unknown": + try: + run_results = await _run_ae_agent_interactive_foreground( + container_id_early, model, timeout_ms, enable_skill, enable_subagent + ) + logger.info('ae_agent interactive session finished with exit_code=%s', run_results.exit_code) + except Exception as e: + logger.warning('ae_agent interactive foreground failed: %s', e) + else: + logger.warning('Cannot get container ID for interactive mode; falling back to non-interactive.') + + if run_results is None: + if is_long_running_agent: + # Live log monitoring: run runner in background, poll log file periodically + await runtime.run_in_session(BashAction(command='rm -f /agent/runner.live.log && touch /agent/runner.live.log', timeout=10.0)) + + # ae_agent: use task file to avoid shell quoting; others pass task string + if is_ae_agent: + start_cmd = ( + 'stdbuf -oL -eL /agent/runner.sh "' + model + '" /agent/current_task.txt > /agent/runner.live.log 2>&1 & ' + 'RUNNER_PID=$!; ' + 'sleep 1; ' + 'echo RUNNER_PID=$RUNNER_PID' + ) + else: + start_cmd = ( + f'bash -c "stdbuf -oL -eL /agent/runner.sh \\"{model}\\" \\"{task}\\" > /agent/runner.live.log 2>&1 & ' + 'RUNNER_PID=$!; ' + 'sleep 1; ' + 'echo RUNNER_PID=$RUNNER_PID"' + ) + start_res = await runtime.run_in_session(BashAction(command=start_cmd, timeout=30.0)) + start_output = str(getattr(start_res, "output", "")).strip() + + pid = None + for line in start_output.split('\n'): + if 'RUNNER_PID=' in line: + pid = line.split('RUNNER_PID=', 1)[1].strip() + break + + if not pid or not pid.isdigit(): + # Fallback: find PID by process name after short delay + await asyncio.sleep(2) + ps_res = await runtime.run_in_session( + BashAction(command="ps aux | grep '[r]unner.py' | awk '{print $2}' | head -1", timeout=10.0) + ) + pid = str(getattr(ps_res, "output", "")).strip() + + 
logger.info(f'{agent_label} runner started with pid: {pid}') + + await asyncio.sleep(2) # Allow log file to have content + + elapsed = 0.0 + poll_interval = 10.0 # Poll every 10s for live log + run_results = None + last_log_content = "" # Track last read content to avoid duplicate output + + while elapsed < runner_timeout: + try: + log_res = await runtime.run_in_session( + BashAction(command='cat /agent/runner.live.log 2>/dev/null || echo ""', timeout=30.0) + ) + current_log_content = str(getattr(log_res, "output", "")).strip() + + if current_log_content and current_log_content != last_log_content: + if last_log_content and current_log_content.startswith(last_log_content): + new_content = current_log_content[len(last_log_content):].strip() + if new_content: + logger.info(f'[{agent_label} live log @ {elapsed:.0f}s ({elapsed/60:.1f} min)]\n{new_content}') + else: + logger.info(f'[{agent_label} live log @ {elapsed:.0f}s ({elapsed/60:.1f} min)]\n{current_log_content}') + last_log_content = current_log_content + elif elapsed % 300 == 0 and elapsed > 0: + logger.info(f'[{agent_label} still running @ {elapsed:.0f}s ({elapsed/60:.1f} min), no new output]') + except Exception as e: + logger.info(f'Failed to read {agent_label} live log: {e}') + + if pid and pid.isdigit(): + ps_res = await runtime.run_in_session( + BashAction(command=f'ps -p {pid} >/dev/null 2>&1; echo $?', timeout=10.0) + ) + ps_code = str(getattr(ps_res, "output", "")).strip() + if ps_code != "0": + wait_res = await runtime.run_in_session( + BashAction(command=f'wait {pid} 2>/dev/null; echo $?', timeout=30.0) + ) + exit_code_str = str(getattr(wait_res, "output", "")).strip() + + class MockResult: + def __init__(self, code): + self.exit_code = int(code) if code.isdigit() else 0 + self.output = f'exit_code={self.exit_code}' + run_results = MockResult(exit_code_str) + logger.info(f'{agent_label} runner finished with exit code: {run_results.exit_code}') + break + else: + ps_res = await runtime.run_in_session( + 
BashAction(command="ps aux | grep '[r]unner.py' | wc -l", timeout=10.0) + ) + proc_count = str(getattr(ps_res, "output", "")).strip() + if proc_count == "0" or not proc_count.isdigit() or int(proc_count) == 0: + logger.info(f'{agent_label} runner process not found, assuming finished') + class MockResult: + def __init__(self): + self.exit_code = 0 + self.output = 'exit_code=0' + run_results = MockResult() + break + + await asyncio.sleep(poll_interval) + elapsed += poll_interval + + if run_results is None: + # Timeout: try to kill process and capture final log + if pid and pid.isdigit(): + try: + await runtime.run_in_session(BashAction(command=f'kill -TERM {pid} 2>/dev/null || kill -9 {pid} 2>/dev/null || true', timeout=10.0)) + except Exception: + pass + try: + tail_log = await runtime.run_in_session( + BashAction(command='tail -n 200 /agent/runner.live.log', timeout=30.0) + ) + logger.info(f'{agent_label} live log tail (on timeout):\n{tail_log}') + except Exception as e: + logger.info(f'Failed to read {agent_label} live log after timeout: {e}') + raise TimeoutError(f'{agent_label} runner exceeded timeout {runner_timeout}s') + + else: + runner_cmd = f'/agent/runner.sh "{model}" "{task}"' + run_results = await runtime.run_in_session(BashAction(command=runner_cmd, timeout=runner_timeout)) + logger.info(f"agent's run results: {run_results}") + logger.info('Runner script finished.') + + # For long-running agents: upload eval scripts before running evaluation + if is_long_running_agent: + logger.info(f'Uploading _agent_eval directories for evaluation ({agent_label})...') + eval_dirs = [] + for root, dirs, files in os.walk(project_path): + if '_agent_eval' in dirs: + eval_source_path = os.path.join(root, '_agent_eval') + rel_path = os.path.relpath(eval_source_path, project_path) + eval_dirs.append((eval_source_path, rel_path)) + + if eval_dirs: + for eval_source_path, rel_path in eval_dirs: + target_eval_path = os.path.join('/repo', rel_path) + logger.info(f'Uploading 
_agent_eval from {eval_source_path} to {target_eval_path}') + try: + await runtime.upload( + UploadRequest( + source_path=eval_source_path, + target_path=target_eval_path, + ) + ) + logger.info(f'_agent_eval directory uploaded: {rel_path}') + except Exception as e: + logger.warning(f'Failed to upload _agent_eval from {eval_source_path}: {e}') + logger.info('All _agent_eval directories uploaded for evaluation.') + else: + logger.warning(f'No _agent_eval directories found in {project_path}') + + # Run evaluator: JSONL evaluator is a path to main.py (e.g. sosp23_acto/_agent_eval/main.py); + # must run from /repo with `python ` so the script is executed correctly. + if test_method.strip().endswith('.py'): + eval_cmd = f"cd /repo && python {test_method.strip()}" + else: + eval_cmd = f"cd /repo && {test_method}" + try: + test_output = await runtime.run_in_session(BashAction(command=eval_cmd)) + logger.info(test_output) + result = { + 'task': task, + 'project_path': project_path, + 'agent_run_results': run_results.output if hasattr(run_results, 'output') else str(run_results), + 'test_method': test_method, + 'score': _parse_eval_score(test_output), + 'status': 'success', + } + except Exception as e: + logger.info(f'Error running test method: {e}') + result = { + 'task': task, + 'project_path': project_path, + 'agent_run_results': run_results.output if hasattr(run_results, 'output') else str(run_results), + 'test_method': test_method, + 'score': 0, + 'status': f'error: {str(e)}', + } + + # For long-running agents: sync+stop (when keep_container=False) or keep container for inspection + if is_long_running_agent: + container_id = await _get_container_id_from_runtime(runtime, deployment) + container_name = ( + getattr(deployment, '_container_name', None) + or getattr(deployment, 'container_name', None) + or 'unknown' + ) + + if is_ae_agent and not keep_container and container_id and container_id != "unknown": + # Original artifact-agent behavior: sync workspace, commit image, 
stop container + try: + from agents.ae_agent.run_eval import save_container_after_run + saved_image, container_stopped = save_container_after_run(container_id, project_path, task_id) + result['saved_image'] = saved_image + result['container_stopped'] = container_stopped + result['container_id'] = container_id + result['container_kept'] = False + logger.info(f'ae_agent: synced workspace, saved image={saved_image}, stopped={container_stopped}') + except Exception as e: + logger.warning(f'save_container_after_run failed: {e}') + result['container_id'] = container_id + result['container_kept'] = True + try: + await deployment.stop() + except Exception as e: + logger.warning(f'deployment.stop() failed: {e}') + elif keep_container: + logger.info('=' * 80) + logger.info(f'Keeping Docker container running for {agent_label} (for debugging purposes).') + logger.info(f'Container ID: {container_id}') + logger.info(f'Task ID: {task_id}') + logger.info(f'Project Path: {project_path}') + logger.info(f' To inspect: docker exec -it {container_id} /bin/bash') + logger.info(f' To stop: docker stop {container_id}') + logger.info('=' * 80) + result['container_id'] = container_id + result['container_name'] = container_name + result['container_kept'] = True + else: + await deployment.stop() + result['container_id'] = container_id + result['container_kept'] = False + else: + await deployment.stop() + result['container_kept'] = False + + + return result + + +def run_eval( + deployment, + project_path, + task_id, + task, + model, + agent_path, + test_method, + save_path, + run_on_host=False, + timeout_ms=None, + gpu=False, + interactive=False, + enable_skill=False, + enable_subagent=False, + keep_container=True, +): + """Run evaluation either on host or in Docker container. 
+ + Args: + deployment: Docker image to use (ignored if run_on_host=True) + project_path: Path to the artifact project + task_id: Task identifier + task: Task description + model: Model name + agent_path: Path to agent scripts + test_method: Evaluation command + save_path: Path to save results + run_on_host: If True, run directly on host machine instead of Docker + timeout_ms: Per-task timeout in milliseconds (None = default 96h for long-running agents) + gpu: If True, pass --gpus all to Docker (Docker mode only) + interactive: If True, enable interactive mode after task (ae_agent only) + enable_skill: If True, enable Claude Agent SDK Skill (ae_agent only) + enable_subagent: If True, enable Claude Agent SDK Sub-agent (ae_agent only) + keep_container: If False and ae_agent, sync workspace + commit image + stop container after run + """ + + if run_on_host: + logger.info(f"Task {task_id} configured to run on HOST machine (run_on_host=True)") + return asyncio.run( + run_eval_on_host( + project_path, + task_id, + task, + model, + agent_path, + test_method, + save_path, + timeout_ms=timeout_ms, + interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + ) + + # Run in Docker container + image = deployment or 'bastoica/ae-agent-ubuntu24.04:latest' + + docker_args = [ + '--privileged', + '--cgroupns=host', + '-e', 'KIND_EXPERIMENTAL_CONTAINERD_SNAPSHOTTER=native', + ] + if gpu: + docker_args.extend(['--gpus', 'all']) + + config = DockerDeploymentConfig( + image=image, + startup_timeout=1200.0, + docker_args=docker_args, + ) + deployment_obj = config.get_deployment() + + return asyncio.run( + run_eval_in_env( + deployment_obj, + project_path, + task_id, + task, + model, + agent_path, + test_method, + save_path, + timeout_ms=timeout_ms, + gpu=gpu, + interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + keep_container=keep_container, + ) + ) + + + +def test(): + task = 'The java is not installed. 
Can you please setup it? Note: you are in a docker with root permission. DO NOT use sudo.' + project_path = '../data/benchmark/projects/test-repo' + test_method = 'java -version' + deployment = 'xuafeng/swe-go-python:latest' + model = 'claude-sonnet-4-5-20250929' + agent_path = './agents/claudecode' + save_path = './eval_results' + task_id = 'test_task_1' + result = run_eval(deployment, project_path, task_id, task, model, agent_path, test_method, save_path) + print('Test result:', result) + + +# TODO: still work on add openhand agent +def test1(): + task = 'The java is not installed. Can you please setup it? Note: you are in a docker with root permission. DO NOT use sudo.' + project_path = '../data/benchmark/projects/test-repo' + test_method = 'java -version' + deployment = 'xuafeng/swe-go-python:latest' + model = 'claude-sonnet-4-5-20250929' + agent_path = './agents/openhand' + save_path = './eval_results' + task_id = 'test_task_1' + result = run_eval(deployment, project_path, task_id, task, model, agent_path, test_method, save_path) + print('Test result:', result) + + +def test2(): + task = "create a python file named hello.py that prints 'hello world'" + project_path = '../data/benchmark/projects/test-repo' + test_method = 'python hello.py' + deployment = 'xuafeng/swe-go-python:latest' + model = 'claude-sonnet-4-5-20250929' + agent_path = './agents/claudecode' + save_path = './eval_results' + task_id = 'test_task_1' + eval_out = asyncio.run( + run_eval_in_env(deployment, project_path, task_id, task, model, agent_path, test_method, save_path) + ) + print(eval_out) + + +if __name__ == '__main__': + test1() diff --git a/benchmarks/arteval_bench/src/utils.py b/benchmarks/arteval_bench/src/utils.py new file mode 100644 index 00000000..56bc657f --- /dev/null +++ b/benchmarks/arteval_bench/src/utils.py @@ -0,0 +1,4 @@ +"""Re-export get_task for main.py when run from benchmark root (python src/main.py).""" +from core.utils import get_task + +__all__ = ["get_task"] diff 
--git a/sdk/utils.py b/sdk/utils.py index cbd79357..995fdfaf 100644 --- a/sdk/utils.py +++ b/sdk/utils.py @@ -62,22 +62,37 @@ def set_llm_endpoint_from_config(config_path): logger.warning(' - %s', key) logger.warning('Only [evaluator_api_keys] values will be used for both evaluator and model under test.') - # First, set environment variables from [llm] + # Placeholder values that should not override an existing env var (e.g. from export) + _placeholders = frozenset({'', 'xxx', 'sk-xxxx', 'sk-xxx'}) + + def _is_placeholder(val): + if val is None: + return True + s = str(val).strip().lower() + return not s or s in _placeholders or s.startswith('sk-xxx') + + # First, set environment variables from [llm] (do not overwrite existing non-placeholder env) logger.info('Setting the following environment variables from [llm]:') for key, value in llm_config.items(): + if _is_placeholder(value) and os.environ.get(key) and not _is_placeholder(os.environ.get(key)): + logger.info('%s: (keeping existing env)', key) + continue logger.info('%s', f'{key}: [REDACTED]' if 'key' in key.lower() else f'{key}: {value}') - os.environ[key] = value + os.environ[key] = str(value) # add exception for SWE-Agent: if key == 'AZURE_API_KEY': - os.environ['AZURE_OPENAI_API_KEY'] = value + os.environ['AZURE_OPENAI_API_KEY'] = str(value) logger.info('AZURE_OPENAI_API_KEY: [REDACTED]') # Then, set environment variables from [evaluator_api_keys] (will override [llm] if conflict) logger.info('Setting the following environment variables from [evaluator_api_keys]:') for key, value in evaluator_config.items(): + if _is_placeholder(value) and os.environ.get(key) and not _is_placeholder(os.environ.get(key)): + logger.info('%s: (keeping existing env)', key) + continue logger.info('%s', f'{key}: [REDACTED]' if 'key' in key.lower() else f'{key}: {value}') - os.environ[key] = value + os.environ[key] = str(value) # add exception for SWE-Agent: if key == 'AZURE_API_KEY': - os.environ['AZURE_OPENAI_API_KEY'] = 
value + os.environ['AZURE_OPENAI_API_KEY'] = str(value) logger.info('AZURE_OPENAI_API_KEY: [REDACTED]')