diff --git a/benchmarks/arteval_bench/.gitignore b/benchmarks/arteval_bench/.gitignore index 64b18d31..8de6bf48 100644 --- a/benchmarks/arteval_bench/.gitignore +++ b/benchmarks/arteval_bench/.gitignore @@ -117,3 +117,6 @@ a.out # Build directories build/ cmake-build-*/ + +# Duplicate task list copies (canonical: arteval_tasks.jsonl) +data/benchmark/arteval_tasks copy*.jsonl diff --git a/benchmarks/arteval_bench/README.md b/benchmarks/arteval_bench/README.md index bfc30f77..84cc1b78 100644 --- a/benchmarks/arteval_bench/README.md +++ b/benchmarks/arteval_bench/README.md @@ -182,5 +182,67 @@ The benchmark supports multiple AI agents: - **Claude Code**: Anthropic's code assistant - **Mini SWE Agent**: The compact version of [SWE-agent](https://github.com/SWE-agent) assistant - **OpenHands**: Open-source coding agent +- **ae_agent**: Claude Agent SDK–based agent (same logic as the standalone [artifact-agent](https://github.com/sys-intelligence/artifact-agent) repo), with full support for host/Docker, interactive mode, Skill, Sub-agent, per-task timeout, GPU, and optional container sync/commit/stop. To add your own agent to the benchmark, see [add_agents.md](add_agents.md). + +#### » ae_agent usage and options + +When using the **ae_agent** (`-a ae_agent` or `-a ae-agent`), you can pass the following from the command line and/or the task JSONL. + +**Command-line arguments** + +| Argument | Description | +|----------|-------------| +| `-i`, `--input_file` | Input JSONL file with tasks (default: `./data/benchmark/arteval_tasks.jsonl`). | +| `-o`, `--save_path` | Directory for results (default: `./outputs/ae__ae-agent_`). | +| `-a`, `--agent` | Agent name; use `ae_agent` or `ae-agent` for this agent. | +| `-m`, `--model_name` | Model name (e.g. `claude-sonnet-4-5-20250929`). | +| `--interactive` | After the task completes, keep a session open so you can give more instructions (requires a TTY). In Docker mode the runner is executed in the foreground via `docker exec -it`. 
| +| `--enable-skill` | Enable Claude Agent SDK Skill (load from `~/.claude/skills/` and `.claude/skills/`). | +| `--enable-subagent` | Enable Claude Agent SDK Sub-agent (Task tool). | + +**JSONL task fields (per line)** + +| Field | Description | +|-------|-------------| +| `artifact_id` | Unique task identifier. | +| `artifact_dir` | Artifact directory name (relative to the JSONL file’s directory). | +| `artifact_readme` | Path to the README or task description file (relative to artifact root). | +| `artifact_url` | Optional. Git clone URL; used when `artifact_dir` is missing or the path does not exist. | +| `env` | `"local"` for host; Docker image name (e.g. `bastoica/ae-agent-ubuntu24.04:latest`) for Docker. | +| `evaluator` | Command to run after the agent (e.g. `python _agent_eval/main.py`). | +| `expected_score` | Expected score for this artifact (default 4). | +| `timeout` | Optional. Per-task timeout in seconds or milliseconds (see utils: values < 86400 are seconds, else milliseconds). | +| `gpu` | Optional. When `true`, pass `--gpus all` to Docker (Docker mode only). | +| `interactive` | Optional. When `true`, enable interactive mode for this task (overrides CLI default). | +| `enable_skill` | Optional. When `true`, enable Skill for this task. | +| `enable_subagent` | Optional. When `true`, enable Sub-agent for this task. | +| `keep_container` | Optional. When `false` (default for ae_agent), after the run the workspace is synced from the container to the host, the container is committed as an image, and the container is stopped. When `true`, the container is left running for inspection. 
| + +**Examples** + +```sh +# Host mode, default options +python src/main.py -i ./data/benchmark/arteval_tasks.jsonl -a ae_agent -o ./outputs/run1 + +# With interactive mode (TTY required for Docker) +python src/main.py --interactive -i ./data/benchmark/arteval_tasks.jsonl -a ae_agent -o ./outputs/run2 + +# Enable Skill and Sub-agent +python src/main.py --enable-skill --enable-subagent -i ./data/benchmark/arteval_tasks.jsonl -a ae_agent -o ./outputs/run3 +``` + +**Outputs (when using ae_agent)** + +Results are written under the given `save_path`: + +- `result.jsonl` — One JSON object per task (task_id, status, score, agent_run_results, etc.). +- `avg_score.json` — Benchmark summary (final_score, total_tasks). +- `ae_report_.md` — Per-task report (status, project path, log file, agent summary, and optional Docker image instructions). +- `summary.json` — Total and successful task counts and success rate (same format as standalone artifact-agent). +- When running via the benchmark entry, log paths and agent summary are filled from available data; standalone `python -m ae_agent.main` also produces `ae_log_.log`. + +**Docker + interactive** + +For Docker tasks with `interactive: true` (or `--interactive`), the benchmark runs the agent in the foreground via `docker exec -it` so you can interact in the same terminal. This requires a real TTY (e.g. running `python src/main.py ...` in a terminal, not under CI or with redirected stdin). If stdin is not a TTY, the run falls back to non-interactive (background runner) and a warning is logged. diff --git a/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README.md b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README.md new file mode 100644 index 00000000..c6b0c758 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README.md @@ -0,0 +1,13 @@ +# AE Agent Smoke Test Artifact + +Minimal task for quick testing of ae_agent (host/docker + evaluation). Should complete in under a minute. 
+ +## Task + +1. In this directory (the artifact root), create a file named **success.txt**. +2. The file must contain exactly the single character **1** (no newline required). +3. No other steps are required. + +Example (bash): `echo -n 1 > success.txt` + +After you finish, the benchmark will run an evaluation script that checks for this file and outputs a score (1 if correct, 0 otherwise). diff --git a/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README_SMOKE_TEST.md b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README_SMOKE_TEST.md new file mode 100644 index 00000000..bab00be4 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README_SMOKE_TEST.md @@ -0,0 +1,44 @@ +# AE Agent smoke test + +## Purpose + +- Test the agent under `src/agents/ae_agent`: **host** and **docker** modes, and the **evaluation script** flow (evaluator runs after the agent and parses score). +- Task is minimal (create `success.txt` with content `1` in the artifact root); finishes in a few minutes and avoids long runs with full arteval_tasks. + +## Files + +- **ae_agent_smoke/**: Minimal artifact + - `README.md`: Task description (create success.txt with content 1) + - `_agent_eval/check.py`: Evaluator; outputs `1` if success.txt exists and contains `1`, else `0` +- **ae_agent_smoke_test.jsonl**: Two lines + - First line: `"env": "local"`, run ae_agent + evaluator on host + - Second line: `"env": "bastoica/ae-agent-ubuntu24.04:latest"` (Docker image), run ae_agent + evaluator in Docker + +## How to run + +From the **benchmarks/arteval_bench** directory: + +```bash +# Set ANTHROPIC_API_KEY or ANTHROPIC_FOUNDRY_API_KEY first +python src/main.py \ + -i ./data/benchmark/ae_agent_smoke_test.jsonl \ + -a ae_agent \ + -m claude-sonnet-4-5-20250929 \ + -o ./outputs/ae_agent_smoke_$(date +%Y%m%d_%H%M%S) +``` + +- **Host task**: Runs the agent on the host, then runs `python3 _agent_eval/check.py` on the host to get the score. 
+- **Docker task**: Runs the agent in the container, then runs the evaluator in the container to get the score; the container is kept running by default for debugging. + +Results are under the `-o` directory: `result.jsonl` (one JSON object per line with `score`, `status`, `test_method`, etc.) and `avg_score.json`. + +## Interactive mode + +The command above runs in **non-interactive** mode (no `--interactive` flag and no `interactive: true` in the task JSONL). To test interactive mode: + +- Use ae_agent’s main entry with `--interactive`, and set `"env": "local"` or `"run_on_host": true` / `"env": "docker"` in the JSONL for the task, for example: + ```bash + cd src/agents + python -m ae_agent.main --interactive -i ../../data/benchmark/ae_agent_smoke_test.jsonl -o ../../outputs/ae_agent_smoke_int + ``` + (run `python -m ae_agent.main` from `src/agents`, the package’s parent directory, so the `ae_agent` package is importable) +- In interactive mode, after the first task completes you can keep typing instructions; type `quit` or `exit` to end. diff --git a/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/_agent_eval/check.py b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/_agent_eval/check.py new file mode 100644 index 00000000..e0d7c479 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/_agent_eval/check.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +"""Minimal evaluator for ae_agent_smoke: output 1 if success.txt exists and contains '1', else 0. + +Output must be a single digit on a line (or last line) for benchmark score parsing. 
+""" +import os +import sys + +def main(): + root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + path = os.path.join(root, "success.txt") + if os.path.isfile(path): + with open(path, "r") as f: + content = f.read().strip() + if content == "1": + print(1) + sys.exit(0) + print(0) + sys.exit(0) + +if __name__ == "__main__": + main() diff --git a/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke_test.jsonl b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke_test.jsonl new file mode 100644 index 00000000..de29995c --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke_test.jsonl @@ -0,0 +1,2 @@ +{"artifact_id": "ae_agent_smoke_host", "artifact_dir": "ae_agent_smoke", "artifact_readme": "ae_agent_smoke/README.md", "evaluator": "python3 _agent_eval/check.py", "expected_score": 1, "env": "local"} +{"artifact_id": "ae_agent_smoke_docker", "artifact_dir": "ae_agent_smoke", "artifact_readme": "ae_agent_smoke/README.md", "evaluator": "python3 _agent_eval/check.py", "expected_score": 1, "env": "bastoica/ae-agent-ubuntu24.04:latest", "timeout": 120000} diff --git a/benchmarks/arteval_bench/data/benchmark/arteval_tasks.jsonl b/benchmarks/arteval_bench/data/benchmark/arteval_tasks.jsonl index 1f46440a..8928ead5 100644 --- a/benchmarks/arteval_bench/data/benchmark/arteval_tasks.jsonl +++ b/benchmarks/arteval_bench/data/benchmark/arteval_tasks.jsonl @@ -1,6 +1,6 @@ -{"artifact_id": "sosp24_wasabi", "artifact_dir": "sosp24_wasabi", "artifact_readme": "sosp24_wasabi/wasabi/README.md", "artifact_url": "https://github.com/bastoica/wasabi/tree/sosp24-ae", "evaluator": "sosp24_wasabi/wasabi/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} -{"artifact_id": "osdi24_anvil", "artifact_dir": "osdi24_anvil", "artifact_readme": "osdi24_anvil/anvil/README.md", "artifact_url": "https://github.com/anvil-verifier/anvil", "evaluator": "osdi24_anvil/_agent_eval/main.py", "expected_score": 4, "docer_env": 
"bastoica/ae-agent-ubuntu24.04:latest"} -{"artifact_id": "sosp23_acto", "artifact_dir": "sosp23_acto", "artifact_readme": "sosp23_acto/acto/README.md", "artifact_url": "https://github.com/xlab-uiuc/acto", "evaluator": "sosp23_acto/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} -{"artifact_id": "eurosys25_egwalker", "artifact_dir": "eurosys25_egwalker", "artifact_readme": "eurosys25_egwalker/egwalker/README.md", "artifact_url": "https://github.com/josephg/egwalker-paper", "evaluator": "eurosys25_egwalker/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} -{"artifact_id": "eurosys25_depsurf", "artifact_dir": "eurosys25_depsurf", "artifact_readme": "eurosys25_depsurf/depsurf/README.md", "artifact_url": "https://github.com/ShawnZhong/DepSurf", "evaluator": "eurosys25_depsurf/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} -{"artifact_id": "osdi24_eet", "artifact_dir": "osdi24_eet", "artifact_readme": "osdi24_eet/eet/README.md", "artifact_url": "https://github.com/JZuming/EET", "evaluator": "osdi24_eet/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} \ No newline at end of file +{"artifact_id": "sosp24_wasabi", "artifact_dir": "sosp24_wasabi", "artifact_readme": "sosp24_wasabi/wasabi/README.md", "artifact_url": "https://github.com/bastoica/wasabi/tree/sosp24-ae", "evaluator": "sosp24_wasabi/wasabi/_agent_eval/main.py", "expected_score": 4, "env": "bastoica/ae-agent-ubuntu24.04:latest"} +{"artifact_id": "osdi24_anvil", "artifact_dir": "osdi24_anvil", "artifact_readme": "osdi24_anvil/anvil/README.md", "artifact_url": "https://github.com/anvil-verifier/anvil", "evaluator": "osdi24_anvil/_agent_eval/main.py", "expected_score": 4, "env": "bastoica/ae-agent-ubuntu24.04:latest"} +{"artifact_id": "sosp23_acto", "artifact_dir": "sosp23_acto", "artifact_readme": "sosp23_acto/acto/README.md", 
"artifact_url": "https://github.com/xlab-uiuc/acto", "evaluator": "sosp23_acto/_agent_eval/main.py", "expected_score": 4, "env": "bastoica/ae-agent-ubuntu24.04:latest"} +{"artifact_id": "eurosys25_egwalker", "artifact_dir": "eurosys25_egwalker", "artifact_readme": "eurosys25_egwalker/egwalker/README.md", "artifact_url": "https://github.com/josephg/egwalker-paper", "evaluator": "eurosys25_egwalker/_agent_eval/main.py", "expected_score": 4, "env": "bastoica/ae-agent-ubuntu24.04:latest"} +{"artifact_id": "eurosys25_depsurf", "artifact_dir": "eurosys25_depsurf", "artifact_readme": "eurosys25_depsurf/depsurf/README.md", "artifact_url": "https://github.com/ShawnZhong/DepSurf", "evaluator": "eurosys25_depsurf/_agent_eval/main.py", "expected_score": 4, "env": "bastoica/ae-agent-ubuntu24.04:latest"} +{"artifact_id": "osdi24_eet", "artifact_dir": "osdi24_eet", "artifact_readme": "osdi24_eet/eet/README.md", "artifact_url": "https://github.com/JZuming/EET", "evaluator": "osdi24_eet/_agent_eval/main.py", "expected_score": 4, "env": "bastoica/ae-agent-ubuntu24.04:latest"} diff --git a/benchmarks/arteval_bench/env.toml b/benchmarks/arteval_bench/env.toml index eac33edd..564e06ab 100644 --- a/benchmarks/arteval_bench/env.toml +++ b/benchmarks/arteval_bench/env.toml @@ -2,7 +2,7 @@ AZURE_API_KEY = "XXX" AZURE_API_BASE = "XXXX" AZURE_API_VERSION = "XXX" -ANTHROPIC_API_KEY = "sk-XXXX" +ANTHROPIC_API_KEY = "YOUR_ANTHROPIC_API_KEY" [hardware] use_gpu = false diff --git a/benchmarks/arteval_bench/run_ae_agent_smoke_test.sh b/benchmarks/arteval_bench/run_ae_agent_smoke_test.sh new file mode 100755 index 00000000..dba27087 --- /dev/null +++ b/benchmarks/arteval_bench/run_ae_agent_smoke_test.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Run ae_agent smoke test under arteval_bench (host + docker, with evaluation). 
+# Usage: ./run_ae_agent_smoke_test.sh [model_name] +# Default model: claude-sonnet-4-5-20250929 + +set -e +BENCH_ROOT="$(cd "$(dirname "$0")" && pwd)" +cd "$BENCH_ROOT" +MODEL="${1:-claude-sonnet-4-5-20250929}" +OUT_DIR="./outputs/ae_agent_smoke_$(date +%Y%m%d_%H%M%S)" +echo "==> AE Agent smoke test (host + docker + evaluation)" +echo " Model: $MODEL" +echo " Output: $OUT_DIR" +echo "" +python src/main.py \ + -i ./data/benchmark/ae_agent_smoke_test.jsonl \ + -a ae_agent \ + -m "$MODEL" \ + -o "$OUT_DIR" +echo "" +echo "==> Done. Results: $OUT_DIR/result.jsonl and $OUT_DIR/avg_score.json" diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/README.md b/benchmarks/arteval_bench/src/agents/ae_agent/README.md new file mode 100644 index 00000000..f630da1d --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/README.md @@ -0,0 +1,62 @@ +# AE Agent (ArtEval sub-agent) + +This agent is the **ae_agent** for the system-intelligence-benchmark ArtEval benchmark, with the same logic as the standalone [ae-agent](https://github.com/Couen/ae-agent) repo. It runs inside the benchmark container using the Claude Agent SDK to execute artifact evaluation tasks. + +## Files + +- **install.sh**: Installs `claude-agent-sdk` inside the container for use by runner.py. +- **runner.sh**: Entry script; invoked as `runner.sh `. Uses `/agent/current_task.txt` when the benchmark passes the task via file. +- **runner.py**: Runs the task with Claude Agent SDK; supports 429 rate-limit retry; second argument can be task text or path to a task file. Artifact path in container is `/repo`. +- **run_eval.py**: Single-task orchestration: `env='local'` runs on host, otherwise runs in Docker (requires swerex/swe-rex). +- **main.py**: CLI entry for batch runs from JSONL; supports host or Docker per task. +- **utils.py**: Timeout, task/path helpers, Tee, reports, summary (used by runner, main, run_eval). +- **__init__.py**: Package marker. 
+ +## Usage from the benchmark + +From the benchmark root (`benchmarks/arteval_bench/`): + +```bash +python src/main.py -i ./data/benchmark/arteval_tasks.jsonl -a ae_agent -m claude-sonnet-4-5-20250929 -o ./outputs/ae_agent_run +``` + +You can also use `-a ae-agent`; it is equivalent to `ae_agent`. + +The benchmark will: + +1. Upload this agent to `/agent` in the container. +2. For ae_agent: write the task to `/agent/current_task.txt`, then run `runner.sh "$model" /agent/current_task.txt` (avoids shell quoting issues with large tasks). +3. Use long-running and live-log behavior (48h timeout, streamed logs, remove `_agent_eval` before run and re-upload before evaluation, container kept for debugging). +4. **Evaluation script flow** (same as claude_sdk): after the agent finishes, run the JSONL `evaluator` (test_method), e.g. `cd /repo && python _agent_eval/main.py`, parse output for `score` and write to result. +5. If set, pass through `ANTHROPIC_API_KEY`, `ANTHROPIC_FOUNDRY_API_KEY`, `ANTHROPIC_FOUNDRY_BASE_URL`, `CLAUDE_CODE_USE_FOUNDRY`. + +**Evaluation flow on host**: When `run_on_host=True` and the agent is ae_agent, `run_eval_in_env.run_eval_on_host` calls this package's `run_agent_then_eval()`: run the agent first, then run `test_method` on the host (e.g. `cd project_path && python _agent_eval/main.py`), parse score with `utils.parse_eval_score()`, and return a result with the same shape as the Docker path (`score`, `test_method`, `status`). + +## Dependencies + +- Python 3; `claude-agent-sdk` is installed in the container via `install.sh`. +- When running in Docker via the benchmark's `run_eval_in_env.py`, install `swerex` on the host (the benchmark includes it). When using this directory's `main.py` for Docker mode standalone, you also need `swe-rex`. + +## Running on host (local) + +You can run tasks on the **host** from this directory (without the benchmark's Docker flow): + +1. 
**Single or batch via main.py** + Use a JSONL where each line can set `"env": "local"` or `"run_on_host": true` to run that task on the host; others run in Docker (requires swerex). + + ```bash + cd benchmarks/arteval_bench/src/agents/ae_agent + python -m ae_agent.main -i /path/to/tasks.jsonl -a ae_agent -m claude-sonnet-4-5-20250929 -o ./outputs/host_run + ``` + +2. **Host mode requirements** + - Set `ANTHROPIC_API_KEY` or `ANTHROPIC_FOUNDRY_API_KEY` + - Docker installed and running (for prereq check; agent runs on host) + - `pip install claude-agent-sdk` + +3. **Docker mode from this directory** + If the JSONL has `"env": "docker"` (or `run_on_host` is not set), `main.py` runs that task in Docker via `run_eval.py` (requires `swe-rex`/`swerex`). + +## Relation to the standalone ae-agent repo + +The standalone ae-agent repo provides the same host/Docker CLI. This sub-agent includes both the **in-container** runner (used by the benchmark's `run_eval_in_env.py`) and **host/local** mode via `main.py` and `run_eval.py`. diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/__init__.py b/benchmarks/arteval_bench/src/agents/ae_agent/__init__.py new file mode 100644 index 00000000..ca489f55 --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/__init__.py @@ -0,0 +1,23 @@ +"""AE Agent - A tool for running Claude Agent SDK on artifact evaluation tasks. 
+ +Output files (under save_path): +- ae_report_<task_id>.md: Per-artifact report with status and agent summary +- ae_log_<task_id>.log: Per-artifact execution log +- result.jsonl: Per-task results (one JSON per line) +- summary.json: Overall statistics +""" + +from .main import cli_main, main +from .run_eval import run_agent_then_eval, run_eval +from .runner import build_system_prompt, run_agent +from .utils import parse_eval_score + +__all__ = [ + 'build_system_prompt', + 'cli_main', + 'main', + 'parse_eval_score', + 'run_agent', + 'run_agent_then_eval', + 'run_eval', +] diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/install.sh b/benchmarks/arteval_bench/src/agents/ae_agent/install.sh new file mode 100644 index 00000000..829de33d --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/install.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# Setup agent running environment inside Docker container. +# Ensures claude-agent-sdk is available so runner.py can import claude_agent_sdk. +set -e +if ! python3 -c "import claude_agent_sdk" 2>/dev/null; then + echo "Installing claude-agent-sdk..." + pip3 install claude-agent-sdk==0.1.24 || pip3 install --break-system-packages claude-agent-sdk==0.1.24 || true + if ! python3 -c "import claude_agent_sdk"; then + echo "WARNING: claude_agent_sdk still not importable; runner may fail." + fi +fi +echo "Agent environment ready." diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/main.py b/benchmarks/arteval_bench/src/agents/ae_agent/main.py new file mode 100644 index 00000000..ac39a3a7 --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/main.py @@ -0,0 +1,307 @@ +"""Main entry point for running artifact tasks. + +Supports both: +- Run from this directory: env=local (host) or env=docker per task in JSONL. +- Used as in-container runner when benchmark (arteval_bench) uploads this agent to /agent. 
+""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import sys +from dataclasses import dataclass +from datetime import datetime + +from .run_eval import make_error_result, run_eval +from .utils import ( + AGENT_SUMMARY_FALLBACK_MAX, + DEFAULT_MODEL, + LOG_OUTPUT_TRUNCATE_BYTES, + SUMMARY_BASENAME_TEMPLATE, + SUMMARY_INSTRUCTION, + Tee, + compute_and_write_summary, + docker_image_from_item, + enable_skill_from_item, + enable_subagent_from_item, + env_from_item, + get_task, + gpu_from_item, + interactive_from_item, + read_task_from_file, + resolve_project_path, + safe_task_id, + timeout_ms_from_item, + write_task_report, +) + + +def _build_task_with_summary(task: str, safe_id: str) -> tuple[str, str]: + """Append summary instruction to task. Returns (task, summary_basename).""" + summary_basename = SUMMARY_BASENAME_TEMPLATE.format(safe_id=safe_id) + full_task = task.rstrip() + SUMMARY_INSTRUCTION.format(basename=summary_basename) + return full_task, summary_basename + + +def _persist_result(save_path: str, result: dict, log_path: str) -> None: + """Write result to result.jsonl and append run output to log.""" + with open(f'{save_path}/result.jsonl', 'a+', encoding='utf-8') as fw: + fw.write(json.dumps(result, ensure_ascii=False) + '\n') + with open(log_path, 'a', encoding='utf-8') as lf: + lf.write(f'\nTask finished at {result["timestamp"]}, status: {result.get("status", "unknown")}\n') + lf.write('\n--- Agent run output ---\n') + run_out = str(result.get('agent_run_results', '')) + lf.write(run_out[:LOG_OUTPUT_TRUNCATE_BYTES]) + if len(run_out) > LOG_OUTPUT_TRUNCATE_BYTES: + lf.write('\n... 
(truncated)\n') + + +def _gather_agent_summary(project_path: str, summary_basename: str, result: dict) -> str: + """Read agent summary file or fallback to truncated run output.""" + summary_file = os.path.join(project_path, summary_basename) + if os.path.isfile(summary_file): + try: + with open(summary_file, encoding='utf-8') as f: + return f.read() + except OSError as e: + logging.warning('Failed to read summary file %s: %s', summary_file, e) + fallback = str(result.get('agent_run_results', ''))[:AGENT_SUMMARY_FALLBACK_MAX] + return fallback or '(No summary captured)' + + +def _persist_skipped(save_path: str, task_id: str, message: str) -> None: + """Append one result line for a skipped task so summary total is accurate.""" + result = { + 'task_id': task_id, + 'status': 'skipped', + 'message': message, + 'timestamp': datetime.now().isoformat(), + } + with open(f'{save_path}/result.jsonl', 'a+', encoding='utf-8') as fw: + fw.write(json.dumps(result, ensure_ascii=False) + '\n') + + +def _run_single_task( + item: dict, + model: str, + agent: str, + save_path: str, + input_file: str, + interactive_default: bool, + enable_skill_default: bool = False, + enable_subagent_default: bool = False, +) -> None: + """Process a single JSONL task: parse, run, write results and report.""" + env = env_from_item(item) + docker_image = docker_image_from_item(item, env=env) + use_gpu = gpu_from_item(item) + interactive = interactive_from_item(item) or interactive_default + enable_skill = enable_skill_from_item(item, enable_skill_default) + enable_subagent = enable_subagent_from_item(item, enable_subagent_default) + task_file = item.get('artifact_readme', None) + task_id = item.get('artifact_id', None) + timeout_ms = timeout_ms_from_item(item) + safe_id = safe_task_id(task_id) + + project_path, path_error = resolve_project_path(item, input_file, save_path) + if path_error: + print(path_error) + _persist_skipped(save_path, task_id or safe_id, path_error) + return + print(f'Project path: 
{project_path}') + + raw_task = read_task_from_file(project_path, task_file) if task_file else get_task('README.md') + task, summary_basename = _build_task_with_summary(raw_task, safe_id) + + task_file_path = os.path.join(save_path, f'current_task_{safe_id}.txt') + with open(task_file_path, 'w', encoding='utf-8') as f: + f.write(task) + + timeout_str = str(timeout_ms) if timeout_ms is not None else 'default' + print(f'Task {task_id}: env={env}, timeout_ms={timeout_str}, gpu={use_gpu}, interactive={interactive}, enable_skill={enable_skill}, enable_subagent={enable_subagent}') + + log_path = os.path.join(save_path, f'ae_log_{safe_id}.log') + with open(log_path, 'w', encoding='utf-8') as lf: + lf.write(f'Task {task_id} started at {datetime.now().isoformat()}\n') + lf.write(f'Project path: {project_path}\n') + lf.write(f'Env: {env}\n\n') + + # Run task (stdout/stderr teed to log), then persist result and report. + # Note: For env='local', agent_path is ignored; the in-process runner (this package) is used. 
+ old_stdout, old_stderr = sys.stdout, sys.stderr + try: + with Tee(sys.stdout, log_path) as tee_out: + with Tee(sys.stderr, log_path) as tee_err: + sys.stdout, sys.stderr = tee_out, tee_err + result = run_eval( + env=env, + docker_image=docker_image, + project_path=project_path, + task_id=task_id, + task=task, + task_file_path=task_file_path, + model=model, + agent_path=agent, + save_path=save_path, + timeout_ms=timeout_ms, + use_gpu=use_gpu, + interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + except Exception as e: + sys.stdout, sys.stderr = old_stdout, old_stderr + logging.exception('run_eval failed for task %s: %s', task_id, e) + result = make_error_result(task_id, task, project_path, str(e), env) + finally: + sys.stdout, sys.stderr = old_stdout, old_stderr + + result['timestamp'] = datetime.now().isoformat() + result['log_file'] = log_path + _persist_result(save_path, result, log_path) + + agent_summary = _gather_agent_summary(project_path, summary_basename, result) + write_task_report(save_path, safe_id, task_id, result, log_path, agent_summary) + print(f'Task {task_id} completed. 
Status: {result.get("status", "unknown")}') + + +def main(input_file, model, agent, save_path, interactive_default: bool = False, enable_skill_default: bool = False, enable_subagent_default: bool = False): + """Main function for running tasks.""" + if not os.path.isfile(input_file): + logging.error('Input file not found: %s', input_file) + sys.exit(1) + + print(f'Using model: {model}, agent: {agent}') + + with open(input_file, encoding='utf-8') as f: + for line_no, line in enumerate(f, start=1): + if not line.strip(): + continue + try: + item = json.loads(line) + except json.JSONDecodeError as e: + print(f'Skipping invalid JSON at line {line_no}: {e}') + _persist_skipped(save_path, f'line_{line_no}', f'Invalid JSON: {e}') + continue + + _run_single_task( + item=item, + model=model, + agent=agent, + save_path=save_path, + input_file=input_file, + interactive_default=interactive_default, + enable_skill_default=enable_skill_default, + enable_subagent_default=enable_subagent_default, + ) + + total_count, success_count = compute_and_write_summary(save_path) + print(f'All tasks completed: {success_count}/{total_count} succeeded.') + + +@dataclass +class _ResolvedConfig: + """Resolved CLI configuration ready for main().""" + + input_file: str + model: str + agent: str + save_path: str + interactive_default: bool + enable_skill_default: bool + enable_subagent_default: bool + + +def _parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser(description='AE Agent - Run Claude Agent SDK on artifact tasks') + parser.add_argument( + '-i', + '--input_file', + help='Input JSONL file with tasks', + default='./data/benchmark/arteval_tasks.jsonl', + ) + parser.add_argument('-o', '--save_path', help='Result save path', default=None) + parser.add_argument( + '-a', + '--agent', + help='Agent name (default: ae-agent)', + default='ae-agent', + ) + parser.add_argument( + '-m', + '--model_name', + help='Model Name', + 
default=DEFAULT_MODEL, + ) + parser.add_argument( + '--interactive', + action='store_true', + help='Enable interactive mode (continue giving agent instructions after task completes)', + ) + parser.add_argument( + '--enable-skill', + action='store_true', + help='Enable Claude Agent SDK Skill (load from ~/.claude/skills/ and .claude/skills/)', + ) + parser.add_argument( + '--enable-subagent', + action='store_true', + help='Enable Claude Agent SDK Sub-agent (Task tool)', + ) + return parser.parse_args() + + +def _resolve_paths(args: argparse.Namespace) -> _ResolvedConfig: + """Resolve paths and agent from parsed args.""" + model_name = args.model_name + agent = args.agent + input_file = args.input_file + save_path = args.save_path + + if save_path is None: + str_model_name = model_name.replace('/', '_').lower() + timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + save_path = os.path.join('./outputs', f'ae_{str_model_name}_ae-agent_{timestamp}') + + # When running from this directory (standalone or as arteval_bench agent), use script dir as agent path + if agent in ('ae-agent', 'ae_agent', 'claude_sdk'): + agent = os.path.dirname(os.path.abspath(__file__)) + + save_path = os.path.abspath(os.path.expanduser(save_path)) + os.makedirs(save_path, exist_ok=True) + + return _ResolvedConfig( + input_file=input_file, + model=model_name, + agent=agent, + save_path=save_path, + interactive_default=getattr(args, 'interactive', False), + enable_skill_default=getattr(args, 'enable_skill', False), + enable_subagent_default=getattr(args, 'enable_subagent', False), + ) + + +def cli_main(): + """CLI entry point.""" + args = _parse_args() + config = _resolve_paths(args) + print(f'Input file: {config.input_file}') + print(f'Save path: {config.save_path}') + print(f'Agent path: {config.agent}') + main( + config.input_file, + config.model, + config.agent, + config.save_path, + interactive_default=config.interactive_default, + enable_skill_default=config.enable_skill_default, + 
enable_subagent_default=config.enable_subagent_default, + ) + + +if __name__ == '__main__': + cli_main() diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py b/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py new file mode 100644 index 00000000..75be027c --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py @@ -0,0 +1,1015 @@ +"""Orchestration for executing artifact tasks in Docker or on host. + +Single entry point: run_eval(env, project_path, task_id, ...). +- env='local' -> _run_local() -> runner.run_agent() directly on host +- env != 'local' -> _run_in_docker() -> runner.py executed inside container +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import os +import shutil +import subprocess +import sys +import tempfile +import time +from dataclasses import dataclass +from pathlib import Path + +from .runner import run_agent +from .utils import ( + DEFAULT_DOCKER_IMAGE, + apply_timeout_env, + has_api_key, + is_local_env, + parse_eval_score, + resolve_timeout_ms, + safe_task_id, + status_from_exit_code, + timeout_env_dict, +) + +SWEREX_AVAILABLE = False + + +def _import_swerex(): + """Try importing swerex under both package names (swerex and swe_rex). + + The package was renamed; we support both for backward compatibility. + Returns (DockerDeploymentConfig, BashAction, CreateBashSessionRequest, UploadRequest) + or raises ImportError. 
+ """ + for pkg in ('swerex', 'swe_rex'): + try: + mod_docker = __import__(f'{pkg}.deployment.docker', fromlist=['DockerDeploymentConfig']) + mod_runtime = __import__( + f'{pkg}.runtime.abstract', fromlist=['BashAction', 'CreateBashSessionRequest', 'UploadRequest'] + ) + return ( + mod_docker.DockerDeploymentConfig, + mod_runtime.BashAction, + mod_runtime.CreateBashSessionRequest, + mod_runtime.UploadRequest, + ) + except ImportError: + continue + raise ImportError("Neither 'swerex' nor 'swe_rex' is installed") + + +try: + DockerDeploymentConfig, BashAction, CreateBashSessionRequest, UploadRequest = _import_swerex() + SWEREX_AVAILABLE = True +except ImportError: + logging.warning('swerex/swe-rex not available. Docker mode will not work.') + + +# Progress log every 5 minutes when runner is still running. +_PROGRESS_LOG_INTERVAL_SEC = 300 + +# Poll interval for checking runner status. +_POLL_INTERVAL_SEC = 10.0 + + +@dataclass +class _RunnerResult: + """Result from a Docker runner process.""" + + exit_code: int + output: str + + +def _make_eval_result( + task_id: str, + task: str, + project_path: str, + agent_output: str, + status: str, + run_on_host: bool, + *, + container_id: str | None = None, + saved_image: str | None = None, + container_stopped: bool = False, + message_count: int | None = None, + score: int | None = None, + test_method: str | None = None, +) -> dict: + """Build unified eval result dict for both host and Docker modes.""" + result = { + 'task_id': task_id, + 'task': task, + 'project_path': project_path, + 'agent_run_results': agent_output, + 'status': status, + 'run_on_host': run_on_host, + 'container_id': container_id, + 'saved_image': saved_image, + 'container_stopped': container_stopped, + } + if message_count is not None: + result['message_count'] = message_count + if score is not None: + result['score'] = score + if test_method is not None: + result['test_method'] = test_method + return result + + +def make_error_result( + task_id: str, + 
def make_error_result(
    task_id: str,
    task: str,
    project_path: str,
    error_message: str,
    env: str,
) -> dict:
    """Build result dict for run_eval failure (exception/timeout). Same shape as normal result."""
    return _make_eval_result(
        task_id,
        task,
        project_path,
        error_message,
        'error',
        is_local_env(env),
    )


# ---------------------------------------------------------------------------
# Host mode
# ---------------------------------------------------------------------------


def _check_host_prerequisites() -> bool:
    """Check that docker, python, and API key are available on the host."""
    # Docker binary present?
    if shutil.which('docker') is None:
        logging.error('Docker is not installed on host.')
        return False
    # Docker daemon reachable?
    probe = subprocess.run(['docker', 'ps'], capture_output=True, timeout=10)
    if probe.returncode != 0:
        logging.error('Docker is not running on host.')
        return False
    # Credentials available?
    if not has_api_key():
        logging.error('Neither ANTHROPIC_API_KEY nor ANTHROPIC_FOUNDRY_API_KEY is set.')
        return False
    return True


def _write_claude_settings(timeout_ms: int):
    """Write ~/.claude/settings.json with timeout configuration."""
    settings_dir = Path.home() / '.claude'
    settings_dir.mkdir(exist_ok=True)
    payload = {'env': timeout_env_dict(timeout_ms)}
    with open(settings_dir / 'settings.json', 'w', encoding='utf-8') as fh:
        json.dump(payload, fh, indent=2)


async def _run_local(
    project_path,
    task_id,
    task,
    model,
    timeout_ms: int,
    *,
    skip_prereq_check: bool = False,
    interactive: bool = False,
    enable_skill: bool = False,
    enable_subagent: bool = False,
):
    """Run one task on host by delegating to runner.run_agent()."""
    banner = '=' * 80
    print(banner)
    print('Running task on HOST MACHINE')
    print(banner)

    if not skip_prereq_check and not _check_host_prerequisites():
        raise RuntimeError('Host prerequisites check failed')

    _write_claude_settings(timeout_ms)
    # run_eval() already calls apply_timeout_env() for local; no need to duplicate here.

    project_path = os.path.abspath(project_path)
    if not os.path.isdir(project_path):
        raise RuntimeError(f'Project path does not exist: {project_path}')

    for line in (
        f'Project path: {project_path}',
        f'Task ID: {task_id}',
        f'Model: {model}',
    ):
        print(line)

    agent_result = await run_agent(
        model,
        task,
        env='local',
        artifact_path=project_path,
        timeout_ms=timeout_ms,
        interactive=interactive,
        enable_skill=enable_skill,
        enable_subagent=enable_subagent,
    )

    return _make_eval_result(
        task_id,
        task,
        project_path,
        agent_result['output'],
        status_from_exit_code(agent_result['exit_code']),
        run_on_host=True,
        message_count=agent_result['message_count'],
    )
+ """ + timeout_ms = resolve_timeout_ms(timeout_ms) + if not skip_prereq_check and not _check_host_prerequisites(): + raise RuntimeError('Host prerequisites check failed') + apply_timeout_env(timeout_ms) + _write_claude_settings(timeout_ms) + + project_path = os.path.abspath(project_path) + if not os.path.isdir(project_path): + raise RuntimeError(f'Project path does not exist: {project_path}') + + # 1. Run agent + agent_result = await run_agent( + model, + task, + env='local', + artifact_path=project_path, + timeout_ms=timeout_ms, + interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + agent_output = agent_result['output'] + agent_status = status_from_exit_code(agent_result['exit_code']) + + # 2. Run evaluation script if provided + if test_method and test_method.strip(): + try: + # Evaluator from JSONL is a path to main.py; run with python from project root. + if test_method.strip().endswith('.py'): + eval_cmd = f'cd {project_path} && python {test_method.strip()}' + else: + eval_cmd = f'cd {project_path} && {test_method}' + eval_result = subprocess.run( + eval_cmd, + shell=True, + capture_output=True, + text=True, + timeout=_EVAL_SCRIPT_TIMEOUT_SEC, + ) + test_output = (eval_result.stdout or '').strip() + score = parse_eval_score(test_output) + status = 'success' if agent_status == 'success' else agent_status + except subprocess.TimeoutExpired: + test_output = '(evaluation script timed out)' + score = 0 + status = 'error' + except Exception as e: + test_output = str(e) + score = 0 + status = f'error: {e}' + else: + test_output = '' + score = 0 + status = agent_status + + return _make_eval_result( + task_id, + task, + project_path, + agent_output, + status, + run_on_host=True, + score=score, + test_method=test_method or '', + ) + + +def run_agent_then_eval( + project_path: str, + task_id: str, + task: str, + model: str, + test_method: str | None, + save_path: str, + timeout_ms: int | None = None, + *, + skip_prereq_check: 
bool = False, + interactive: bool = False, + enable_skill: bool = False, + enable_subagent: bool = False, +) -> dict: + """Synchronous entry: run agent on host then evaluation script; return result with score. + + Called by arteval_bench run_eval_in_env.run_eval_on_host when agent is ae_agent. + """ + return asyncio.run( + _run_agent_then_eval_async( + project_path, + task_id, + task, + model, + test_method, + save_path, + timeout_ms=timeout_ms, + skip_prereq_check=skip_prereq_check, + interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + ) + + +# --------------------------------------------------------------------------- +# Docker helpers +# --------------------------------------------------------------------------- + + +def _validate_agent_path(agent_path: str) -> None: + """Ensure agent_path exists and has required files.""" + if not agent_path or not os.path.isdir(agent_path): + raise RuntimeError(f'Agent path does not exist: {agent_path}') + for name in ('runner.sh', 'runner.py', 'install.sh'): + if not os.path.isfile(os.path.join(agent_path, name)): + raise RuntimeError(f'Agent path missing required file: {name}') + + +def _stdin_is_tty() -> bool: + """Return True if stdin is a real TTY (required for docker exec -it).""" + return hasattr(sys.stdin, 'isatty') and sys.stdin.isatty() + + +def _run_docker_cmd( + args: list[str], + *, + timeout: int = 60, + on_success_message: str | None = None, + on_fail_message: str = 'docker command failed', +) -> bool: + """Run a docker subprocess. 
def _run_docker_cmd(
    args: list[str],
    *,
    timeout: int = 60,
    on_success_message: str | None = None,
    on_fail_message: str = 'docker command failed',
) -> bool:
    """Run a docker subprocess. Return True if returncode is 0, else False and log.

    Args:
        args: Full argv list (e.g. ['docker', 'cp', ...]); run without a shell.
        timeout: Seconds before the subprocess is aborted.
        on_success_message: Printed when the command succeeds, if given.
        on_fail_message: Prefix used in the warning log on failure.
    """
    try:
        r = subprocess.run(
            args,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        if r.returncode == 0:
            if on_success_message:
                print(on_success_message)
            return True
        # Prefer stderr, fall back to stdout, for the failure detail.
        logging.warning('%s: %s', on_fail_message, (r.stderr or r.stdout or '').strip())
        return False
    except subprocess.TimeoutExpired:
        logging.warning('docker command timed out (timeout=%ds)', timeout)
        return False
    except (OSError, subprocess.SubprocessError) as e:
        # OSError also covers a missing docker binary (FileNotFoundError).
        logging.warning('docker command error: %s', e)
        return False


def _merge_tree(src_dir: str, dst_dir: str, exclude: tuple[str, ...] = ('.venv', '.git', '__pycache__')) -> None:
    """Merge src_dir into dst_dir (copy missing/updated from src into dst).

    Recurses into directories that exist on both sides instead of replacing
    them wholesale; entries named in `exclude` are skipped at every level.
    """
    os.makedirs(dst_dir, exist_ok=True)
    for name in os.listdir(src_dir):
        if name in exclude:
            continue
        src_path = os.path.join(src_dir, name)
        dst_path = os.path.join(dst_dir, name)
        if os.path.isdir(src_path):
            if os.path.isdir(dst_path):
                _merge_tree(src_path, dst_path, exclude)
            elif os.path.exists(dst_path):
                # Destination exists but is a file/symlink; do not clobber it.
                logging.warning('Sync skip (destination not a dir): %s', dst_path)
            else:
                shutil.copytree(src_path, dst_path)
        else:
            try:
                shutil.copy2(src_path, dst_path)
            except OSError as e:
                # Best-effort sync: log and continue with remaining entries.
                logging.warning('Sync copy failed %s -> %s: %s', src_path, dst_path, e)


def _sync_workspace(container_id: str, project_path: str) -> None:
    """Copy /repo from container back to host project_path.

    Uses a temp copy plus merge with excludes to avoid overwriting host .venv
    (e.g. when container has .venv/lib64 as a directory and host has it as a
    symlink, which would cause 'cannot overwrite non-directory with directory').
    """
    project_abs = os.path.abspath(project_path)
    if not os.path.isdir(project_abs):
        print(f'WARNING: project_path missing, skipping sync: {project_abs}')
        return

    # Exclude .venv* and .git to avoid overwriting host venv or permission issues
    def _skip_sync(name: str) -> bool:
        return name == '.git' or name == '.venv' or name.startswith('.venv-')

    with tempfile.TemporaryDirectory(prefix='ae_sync_') as tmp:
        dest_tmp = os.path.join(tmp, 'repo')
        if not _run_docker_cmd(
            ['docker', 'cp', f'{container_id}:/repo', dest_tmp],
            timeout=600,
            on_fail_message='docker cp (to temp) failed',
        ):
            return
        # docker cp container:/repo dest_tmp puts repo contents into dest_tmp
        repo_src = dest_tmp
        for name in os.listdir(repo_src):
            if _skip_sync(name):
                continue
            src_path = os.path.join(repo_src, name)
            dst_path = os.path.join(project_abs, name)
            try:
                if os.path.isdir(src_path):
                    if os.path.exists(dst_path):
                        _merge_tree(src_path, dst_path)
                    else:
                        shutil.copytree(src_path, dst_path)
                else:
                    shutil.copy2(src_path, dst_path)
            except (OSError, shutil.Error) as e:
                # Per-entry failures are non-fatal; keep syncing the rest.
                logging.warning('Sync item %s failed: %s', name, e)
        print(f'Synced container /repo -> {project_abs}')


def _commit_container(container_id: str, task_id: str) -> str | None:
    """Commit container state as a Docker image. Returns image tag or None."""
    sid = safe_task_id(task_id, fallback='unknown_task')
    # Docker image tags must be lowercase.
    image_tag = f'ae-agent-{sid.lower()}:latest'
    if not _run_docker_cmd(
        ['docker', 'commit', container_id, image_tag],
        timeout=600,
        on_fail_message='docker commit failed',
    ):
        return None
    return image_tag


def _stop_container(container_id: str) -> bool:
    """Stop a Docker container. Returns True if stopped successfully."""
    return _run_docker_cmd(
        ['docker', 'stop', container_id],
        timeout=60,
        on_success_message=f'Stopped container {container_id}.',
        on_fail_message='docker stop failed',
    )


def _save_container(
    container_id: str,
    project_path: str,
    task_id: str,
) -> tuple[str | None, bool]:
    """Sync workspace, commit image, and stop container.

    Order matters: sync files out first, then snapshot the image, then stop.
    """
    _sync_workspace(container_id, project_path)
    image_tag = _commit_container(container_id, task_id)
    stopped = _stop_container(container_id)
    return image_tag, stopped


def save_container_after_run(container_id: str, project_path: str, task_id: str) -> tuple[str | None, bool]:
    """Sync workspace from container to host, commit as image, stop container.

    Public entry for run_eval_in_env when keep_container=False (original artifact-agent behavior).
    Returns (saved_image_tag, container_stopped).
    """
    return _save_container(container_id, project_path, task_id)


async def _get_container_id(runtime) -> str | None:
    """Get container hostname/ID from inside the container.

    Returns None when the hostname cannot be read or is the placeholder
    'unknown'; callers fall back to non-interactive mode in that case.
    """
    try:
        cid = (
            await _run_bash(
                runtime,
                'cat /etc/hostname 2>/dev/null || hostname 2>/dev/null || echo ""',
                10.0,
            )
        ).strip()
        return cid if cid and cid != 'unknown' else None
    except (AttributeError, TypeError, ValueError) as e:
        logging.debug('Could not get container ID: %s', e)
        return None


def _shell_escape(s: str) -> str:
    """Escape a string for use inside single-quoted shell arguments."""
    # Standard sh idiom: close quote, emit a double-quoted ', reopen quote.
    return s.replace("'", "'\"'\"'")
+ """ + api_key = os.environ.get('ANTHROPIC_API_KEY') + foundry_key = os.environ.get('ANTHROPIC_FOUNDRY_API_KEY') + env = dict(timeout_env_dict(timeout_ms)) + if api_key: + env['ANTHROPIC_API_KEY'] = api_key + if foundry_key: + env['ANTHROPIC_FOUNDRY_API_KEY'] = foundry_key + if not api_key: + env['ANTHROPIC_API_KEY'] = foundry_key + foundry_url = os.environ.get('ANTHROPIC_FOUNDRY_BASE_URL') + if foundry_url: + env['ANTHROPIC_FOUNDRY_BASE_URL'] = foundry_url + if os.environ.get('CLAUDE_CODE_USE_FOUNDRY') == '1': + env['CLAUDE_CODE_USE_FOUNDRY'] = '1' + if enable_skill: + env['AE_ENABLE_SKILL'] = '1' + if enable_subagent: + env['AE_ENABLE_SUBAGENT'] = '1' + return env + + +def _docker_exec_env_args( + timeout_ms: int, + *, + enable_skill: bool = False, + enable_subagent: bool = False, +) -> list[str]: + """Build -e VAR=value args for docker exec (env vars needed by runner.py).""" + env = _build_api_env_dict( + timeout_ms, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + args = [] + for k, v in env.items(): + args.extend(['-e', f'{k}={v}']) + return args + + +async def _upload_task(runtime, task: str, task_file_path: str | None): + """Upload task description to /agent/current_task.txt inside container.""" + tmpdir = tempfile.mkdtemp(prefix='ae_task_') + try: + dest = os.path.join(tmpdir, 'current_task.txt') + if task_file_path and os.path.isfile(task_file_path): + shutil.copy2(task_file_path, dest) + else: + with open(dest, 'w', encoding='utf-8') as f: + f.write(task) + await runtime.upload(UploadRequest(source_path=tmpdir, target_path='/agent_task_file')) + await _run_bash( + runtime, + 'cp /agent_task_file/current_task.txt /agent/current_task.txt', + 10.0, + ) + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + + +async def _setup_container_env( + runtime, timeout_ms: int, *, enable_skill: bool = False, enable_subagent: bool = False +): + """Set timeout and API keys inside the container.""" + env = _build_api_env_dict( + timeout_ms, + 
enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + parts = [f"export {k}='{_shell_escape(v)}'" for k, v in env.items()] + await _run_bash(runtime, ' && '.join(parts)) + + if not has_api_key(): + logging.warning('No API key found. Runner may fail.') + + +def _extract_output(res) -> str: + """Extract output string from swe-rex/bash action result.""" + return str(getattr(res, 'output', '')).strip() + + +async def _run_bash(runtime, command: str, timeout: float = 10.0) -> str: + """Run a Bash command in the container session and return its output. Reduces duplication.""" + res = await runtime.run_in_session(BashAction(command=command, timeout=timeout)) + return _extract_output(res) + + +async def _start_runner_background(runtime, model: str) -> str | None: + """Start runner.sh in background, return pid or None.""" + await _run_bash( + runtime, + 'rm -f /agent/runner.live.log && touch /agent/runner.live.log', + 10.0, + ) + output = await _run_bash( + runtime, + ( + f'stdbuf -oL -eL /agent/runner.sh "{model}" /agent/current_task.txt ' + f'> /agent/runner.live.log 2>&1 & ' + f'RUNNER_PID=$!; sleep 1; echo RUNNER_PID=$RUNNER_PID' + ), + 30.0, + ) + pid = None + for line in output.split('\n'): + if 'RUNNER_PID=' in line: + pid = line.split('RUNNER_PID=', 1)[1].strip() + break + if not pid or not pid.strip().isdigit(): + await asyncio.sleep(2) + pid = await _run_bash( + runtime, + "ps aux | grep '[r]unner.py' | awk '{print $2}' | head -1", + 10.0, + ) + pid = (pid or '').strip() + return pid if pid.isdigit() else None + + +async def _read_runner_log(runtime, elapsed: float, last_log: str) -> str: + """Read live log and print new content. 
async def _read_runner_log(runtime, elapsed: float, last_log: str) -> str:
    """Read live log and print new content. Returns updated last_log."""
    try:
        cur = await _run_bash(runtime, 'cat /agent/runner.live.log 2>/dev/null || echo ""', 30.0)
        if cur and cur != last_log:
            # If the log grew in place print only the appended suffix;
            # otherwise (truncated/rotated) print the whole new content.
            new = cur[len(last_log) :].strip() if cur.startswith(last_log) else cur
            if new:
                print(f'[log @ {elapsed:.0f}s]\n{new}', flush=True)
        return cur
    except (AttributeError, TypeError, ValueError) as e:
        logging.debug('Log read error: %s', e)
    # On read failure keep the previous snapshot so nothing is re-printed.
    return last_log


async def _check_runner_exited(runtime, pid: str | None) -> _RunnerResult | None:
    """Check if runner process has exited. Returns _RunnerResult if exited, else None."""
    if pid and pid.isdigit():
        # ps -p exits non-zero when the PID is gone.
        ps_out = await _run_bash(runtime, f'ps -p {pid} >/dev/null 2>&1; echo $?', 10.0)
        if ps_out.strip() != '0':
            # `wait` recovers the exit code when the runner was a shell child.
            code = await _run_bash(runtime, f'wait {pid} 2>/dev/null; echo $?', 30.0)
            ec = int(code.strip()) if code.strip().isdigit() else -1
            return _RunnerResult(exit_code=ec, output=f'exit_code={ec}')
    else:
        # PID was never captured (e.g. RUNNER_PID parse failed); detect exit by process count.
        cnt = await _run_bash(runtime, "ps aux | grep '[r]unner.py' | wc -l", 10.0)
        if not cnt.strip().isdigit() or int(cnt.strip()) == 0:
            return _RunnerResult(exit_code=-1, output='exit_code=unknown')
    return None


async def _handle_runner_timeout(runtime, pid: str | None) -> None:
    """Kill runner and print log tail on timeout."""
    if pid and pid.isdigit():
        try:
            # Graceful TERM first, then hard KILL; `|| true` keeps exit 0.
            await _run_bash(
                runtime,
                f'kill -TERM {pid} 2>/dev/null || kill -9 {pid} 2>/dev/null || true',
                10.0,
            )
        except (AttributeError, TypeError, ConnectionError) as e:
            logging.debug('Kill runner failed: %s', e)
    try:
        tail_str = await _run_bash(runtime, 'tail -n 200 /agent/runner.live.log', 30.0)
        print(f'Log tail (timeout):\n{tail_str}', flush=True)
    except (AttributeError, TypeError, ValueError) as e:
        logging.debug('Could not read log tail: %s', e)


async def _monitor_runner(runtime, model: str, timeout_s: float) -> _RunnerResult:
    """Start runner.sh in background and poll logs until it finishes or times out.

    Raises:
        TimeoutError: When the runner exceeds timeout_s; the process is killed
            and the log tail is printed before raising.
    """
    pid = await _start_runner_background(runtime, model)
    print(f'Runner started (pid={pid})', flush=True)

    start = time.monotonic()
    last_log = ''
    last_progress_at = 0.0

    while True:
        elapsed = time.monotonic() - start
        if elapsed >= timeout_s:
            break

        last_log = await _read_runner_log(runtime, elapsed, last_log)
        # Heartbeat so long quiet stretches still show liveness.
        if elapsed - last_progress_at >= _PROGRESS_LOG_INTERVAL_SEC:
            print(f'[still running @ {elapsed:.0f}s]', flush=True)
            last_progress_at = elapsed

        result = await _check_runner_exited(runtime, pid)
        if result is not None:
            print(f'Runner finished (exit_code={result.exit_code})', flush=True)
            return result

        await asyncio.sleep(_POLL_INTERVAL_SEC)

    await _handle_runner_timeout(runtime, pid)
    raise TimeoutError(f'Runner exceeded timeout {timeout_s}s')
async def _run_interactive_in_container(
    container_id: str,
    task_id: str,
    task: str,
    project_path: str,
    model: str,
    timeout_ms: int,
    *,
    enable_skill: bool = False,
    enable_subagent: bool = False,
) -> dict:
    """Run task + interactive in foreground via docker exec -it.

    The same agent session handles both task and follow-up, preserving context.
    Requires a real TTY on stdin; the caller checks _stdin_is_tty() first.
    """
    print(
        '\n'
        + '=' * 60
        + '\nTask + interactive mode (foreground, context preserved).\n'
        + "Type 'quit' or 'exit' to end the interactive session.\n"
        + '=' * 60,
        flush=True,
    )
    exec_args = [
        'docker',
        'exec',
        '-it',
        *_docker_exec_env_args(
            timeout_ms,
            enable_skill=enable_skill,
            enable_subagent=enable_subagent,
        ),
        container_id,
        'python3',
        '-u',  # unbuffered so the user sees agent output live
        '/agent/runner.py',
        model,
        '/agent/current_task.txt',
        '--interactive',
    ]
    try:
        # to_thread keeps the event loop responsive while the blocking
        # foreground exec inherits the real stdio streams.
        proc = await asyncio.to_thread(
            subprocess.run,
            exec_args,
            stdin=sys.__stdin__,
            stdout=sys.__stdout__,
            stderr=sys.__stderr__,
        )
        run_exit_code = proc.returncode
    except (OSError, subprocess.SubprocessError) as e:
        logging.warning('Foreground execution failed for task %s: %s', task_id, e)
        run_exit_code = 1

    return _make_eval_result(
        task_id,
        task,
        project_path,
        f'Interactive session (exit_code={run_exit_code})',
        status_from_exit_code(run_exit_code),
        run_on_host=False,
    )


async def _run_in_docker(  # noqa: C901
    deployment,
    project_path,
    task_id,
    task,
    model,
    agent_path,
    _save_path: str,
    timeout_ms: int,
    *,
    task_file_path: str | None = None,
    interactive: bool = False,
    enable_skill: bool = False,
    enable_subagent: bool = False,
) -> dict:
    """Run task inside a Docker container.

    _save_path: Unused in Docker path (results are returned to main.py which writes reports).
    Kept for a consistent run_eval() → _run_in_docker() API.
    """
    if not SWEREX_AVAILABLE:
        raise RuntimeError('swerex is not available.')

    _validate_agent_path(agent_path)
    await deployment.start()
    runtime = deployment.runtime

    timeout_s = timeout_ms / 1000.0
    # swe-rex doesn't expose a public API for session-level timeout;
    # override the internal config as a workaround.
    if hasattr(runtime, '_config'):
        runtime._config.timeout = timeout_s

    await runtime.create_session(CreateBashSessionRequest())

    print('Uploading project files...', flush=True)
    await runtime.upload(UploadRequest(source_path=project_path, target_path='/repo'))
    await _run_bash(runtime, 'cd /repo')

    print('Uploading agent scripts...', flush=True)
    await runtime.upload(UploadRequest(source_path=agent_path, target_path='/agent'))
    await _run_bash(
        runtime,
        'chmod +x /agent/runner.sh /agent/install.sh 2>/dev/null; /agent/install.sh',
        120.0,  # install.sh may run pip install; allow up to 2 minutes
    )

    await _upload_task(runtime, task, task_file_path)
    await _setup_container_env(
        runtime, timeout_ms, enable_skill=enable_skill, enable_subagent=enable_subagent
    )

    container_id = await _get_container_id(runtime)
    result = None

    try:
        # Prefer foreground interactive when container_id is available and stdin is a TTY.
        if interactive and container_id and _stdin_is_tty():
            result = await _run_interactive_in_container(
                container_id, task_id, task, project_path, model, timeout_ms,
                enable_skill=enable_skill, enable_subagent=enable_subagent,
            )
        else:
            if interactive and not _stdin_is_tty():
                print(
                    'WARNING: Interactive mode requires a terminal (TTY). Running task in non-interactive mode.',
                    flush=True,
                )
            elif interactive and not container_id:
                print(
                    'WARNING: Cannot get container ID; falling back to non-interactive mode.',
                    flush=True,
                )
            # Background run: start runner, poll logs, then return result.
            run_results = await _monitor_runner(runtime, model, timeout_s)
            print(f'Runner result: {run_results}', flush=True)
            result = _make_eval_result(
                task_id,
                task,
                project_path,
                run_results.output,
                status_from_exit_code(run_results.exit_code),
                run_on_host=False,
            )
    except Exception as e:
        logging.error('Task %s error: %s', task_id, e, exc_info=True)
        result = _make_eval_result(
            task_id,
            task,
            project_path,
            str(e),
            'error',
            run_on_host=False,
        )
    finally:
        # Cleanup runs for success, error, and timeout paths alike.
        if not container_id:
            container_id = await _get_container_id(runtime)

        saved_image, stopped = None, False
        if container_id:
            try:
                saved_image, stopped = _save_container(container_id, project_path, task_id)
            except (OSError, subprocess.SubprocessError) as e:
                logging.warning('Save container failed: %s', e)

        try:
            await deployment.stop()
        except Exception as e:
            # Container may already be stopped; deployment.close() can fail with
            # ClientConnectorError when the remote service port is gone.
            logging.warning('deployment.stop() failed for task %s: %s', task_id, e)

        if result is None:
            # Exception occurred before any result was set (e.g. before try body ran
            # or a BaseException was raised). Ensure we always have a dict for update/return.
            result = _make_eval_result(
                task_id,
                task,
                project_path,
                'Execution interrupted or failed before result was set.',
                'error',
                run_on_host=False,
            )
        result.update(
            container_id=container_id,
            saved_image=saved_image,
            container_stopped=stopped,
        )

    return result
def run_eval(
    env: str,
    project_path: str,
    task_id: str,
    task: str,
    model: str,
    agent_path: str,
    save_path: str,
    docker_image: str | None = None,
    timeout_ms: int | None = None,
    *,
    skip_prereq_check: bool = False,
    use_gpu: bool = False,
    task_file_path: str | None = None,
    interactive: bool = False,
    enable_skill: bool = False,
    enable_subagent: bool = False,
) -> dict:
    """Run task in the given environment (local host or Docker).

    Single entry point — called from main.py for each JSONL task.
    """
    timeout_ms = resolve_timeout_ms(timeout_ms)

    # Host path: set the timeout env here (the Docker path configures the
    # container environment instead).
    if is_local_env(env):
        apply_timeout_env(timeout_ms)  # Docker mode uses container env only; no host env.
        print(f'Task {task_id}: HOST (timeout={timeout_ms}ms, interactive={interactive})')
        host_coro = _run_local(
            project_path,
            task_id,
            task,
            model,
            timeout_ms,
            skip_prereq_check=skip_prereq_check,
            interactive=interactive,
            enable_skill=enable_skill,
            enable_subagent=enable_subagent,
        )
        return asyncio.run(host_coro)

    if not SWEREX_AVAILABLE:
        raise RuntimeError('SWE-ReX not available. Install swe-rex for Docker mode.')

    # Docker path: build the deployment, then hand off to the async runner.
    image = docker_image or DEFAULT_DOCKER_IMAGE
    docker_args = [
        '--privileged',
        '--cgroupns=host',
        '-e',
        'KIND_EXPERIMENTAL_CONTAINERD_SNAPSHOTTER=native',
    ]
    if use_gpu:
        docker_args += ['--gpus', 'all']

    deployment = DockerDeploymentConfig(
        image=image,
        startup_timeout=1200.0,
        docker_args=docker_args,
    ).get_deployment()

    gpu_note = ' (GPU)' if use_gpu else ''
    print(f'Task {task_id}: DOCKER (image={image}, timeout={timeout_ms}ms){gpu_note}')
    return asyncio.run(
        _run_in_docker(
            deployment,
            project_path,
            task_id,
            task,
            model,
            agent_path,
            save_path,
            timeout_ms,
            task_file_path=task_file_path,
            interactive=interactive,
            enable_skill=enable_skill,
            enable_subagent=enable_subagent,
        )
    )
# Import utils: as package module or standalone in Docker.
try:
    from .utils import (
        DEFAULT_MODEL,
        DEFAULT_TIMEOUT_MS,
        has_api_key,
        is_local_env,
        resolve_timeout_ms,
    )
except (ImportError, SystemError):
    sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
    try:
        from utils import (
            DEFAULT_MODEL,
            DEFAULT_TIMEOUT_MS,
            has_api_key,
            is_local_env,
            resolve_timeout_ms,
        )
    except ImportError:
        # Fallback when utils is not importable (e.g. container has only runner.py).
        # Duplication intentional; single source is utils.py. Update both if default changes.
        DEFAULT_TIMEOUT_MS = 345_600_000  # 96h
        DEFAULT_MODEL = 'claude-sonnet-4-5-20250929'

        def is_local_env(env: str) -> bool:  # noqa: D103
            return str(env).strip().lower() == 'local'

        def has_api_key() -> bool:  # noqa: D103
            return bool(os.environ.get('ANTHROPIC_API_KEY') or os.environ.get('ANTHROPIC_FOUNDRY_API_KEY'))

        def resolve_timeout_ms(timeout_ms: int | None) -> int:  # noqa: D103
            return timeout_ms if timeout_ms is not None else DEFAULT_TIMEOUT_MS


try:
    from claude_agent_sdk import ClaudeAgentOptions, query

    CLAUDE_SDK_AVAILABLE = True
except ImportError:
    CLAUDE_SDK_AVAILABLE = False

try:
    from claude_agent_sdk import ClaudeSDKClient
except ImportError:
    ClaudeSDKClient = None

# Rate-limit retry tuning and progress cadence.
_RATE_LIMIT_MAX_RETRIES = 5
_RATE_LIMIT_WAIT_SEC = 60
_RATE_LIMIT_WAIT_MAX_SEC = 600
_RATE_LIMIT_WRAPPED_MAX_RETRIES = 3
_PROGRESS_LOG_INTERVAL = 10


# SDK message types whose string form carries the final answer.
_RESULT_TYPE_NAMES = frozenset({'ResultMessage', 'TextBlock'})


def _process_message(message, message_count: int, result_text: str) -> tuple[int, str]:
    """Process one SDK message: print, update count, extract result text.

    Returns (new_message_count, new_result_text).
    """
    new_count = message_count + 1
    if new_count % _PROGRESS_LOG_INTERVAL == 0:
        print(f'[Progress] {new_count} messages...', flush=True)
    rendered = str(message)
    print(rendered, flush=True)
    # Remember the latest result-bearing message; otherwise keep the old text.
    if type(message).__name__ in _RESULT_TYPE_NAMES:
        return new_count, rendered
    return new_count, result_text


def _is_rate_limit_error(exc: BaseException) -> bool:
    """True when the exception text looks like an HTTP 429 / rate-limit error."""
    text = str(exc).lower()
    return any(marker in text for marker in ('429', 'rate limit', 'ratelimitreached'))


def _is_possible_wrapped_rate_limit(exc: BaseException) -> bool:
    """True when a generic CLI failure might be hiding a rate-limit error."""
    text = str(exc).lower()
    if 'check stderr' in text:
        return True
    return 'command failed' in text and 'exit code 1' in text


def _parse_retry_after(exc: BaseException) -> int | None:
    """Extract 'wait N seconds' from an error message, if present."""
    found = re.search(r'wait\s+(\d+)\s*seconds', str(exc), re.I)
    return int(found.group(1)) if found else None


def _parse_env_bool(env_var: str, default: bool = False) -> bool:
    """Parse env var as bool. '1', 'true', 'yes' -> True."""
    raw = os.environ.get(env_var, '').strip().lower()
    if not raw:
        return default
    return raw in ('1', 'true', 'yes')


# Shared prompt fragments
_PROMPT_TIMEOUT_HOST = (
    'TIMEOUT CONFIGURATION (CRITICAL):\n'
    '- Long-running commands (builds, tests, Kind cluster creation) are expected\n'
    '- DO NOT set short timeouts - let commands complete naturally\n\n'
)
_PROMPT_TIMEOUT_DOCKER = (
    'TIMEOUT CONFIGURATION (CRITICAL):\n'
    '- The system has been configured with a Bash timeout of {timeout_ms} ms.\n'
    '- DO NOT specify timeout parameters in your Bash commands.\n'
    '- Long-running commands can take hours - this is normal.\n'
    '- If a command seems to be running long, DO NOT cancel or re-run it.\n\n'
)
_PROMPT_VERIFY_STEPS = (
    'You MUST execute every verification step the README requires. '
    'Do NOT skip steps because they take a long time.\n'
)


def build_system_prompt(
    task: str,
    *,
    env: str = 'docker',
    artifact_path: str | None = None,
    timeout_ms: int | None = None,
) -> str:
    """Build system prompt, parameterized by execution environment.

    Args:
        task: Task description text.
        env: 'local' for host execution, anything else for Docker.
        artifact_path: Path to artifact directory (used in host mode prompt).
        timeout_ms: Bash timeout in ms (shown in Docker mode prompt).
    """
    timeout_ms = resolve_timeout_ms(timeout_ms)
    if is_local_env(env):
        return _host_prompt(task, artifact_path or '.')
    return _docker_prompt(task, artifact_path, timeout_ms)


def _host_prompt(task: str, path: str) -> str:
    """Prompt for running directly on the host machine."""
    return (
        'You are an experienced software engineer completing an artifact task.\n\n'
        'ENVIRONMENT SETUP (HOST MACHINE):\n'
        '- You are running DIRECTLY on the host machine (NOT inside a Docker container)\n'
        '- Docker daemon is already running on this host\n'
        '- You may need sudo for some operations\n\n'
        f'ARTIFACT LOCATION:\n'
        f'- The artifact repository is located at: {path}\n'
        f'- Start by changing to this directory: cd {path}\n\n'
        f'YOUR TASK:\n{task}\n\n'
        + _PROMPT_TIMEOUT_HOST
        + 'IMPORTANT GUIDELINES:\n'
        f'1. First, cd to {path} and examine the directory structure\n'
        '2. Follow the README instructions step by step\n'
        f'3. {_PROMPT_VERIFY_STEPS}'
        "4. If you see 'sudo' in instructions, you can use it (or skip if already root)\n"
        '5. Use the Bash tool to run commands, Read tool to inspect files\n'
        '6. Work systematically through setup, build, and experiment execution\n'
        '7. If you encounter errors, debug and resolve them using available tools\n'
        "8. For Kind clusters, they will work properly since you're on the host (not DinD)"
    )


def _docker_prompt(task: str, artifact_path: str | None, timeout_ms: int) -> str:
    """Prompt for running inside a Docker container."""
    # Under arteval_bench the artifact is uploaded to /repo; standalone runs
    # may not know the path, so fall back to "explore and find it".
    if artifact_path:
        path_hint = f'- The artifact repository is at: {artifact_path}. Change to it: cd {artifact_path}\n'
    else:
        path_hint = (
            '- The artifact repository should be in the current working directory or nearby.\n'
            '- Explore the directory structure to find the artifact repository.\n'
        )
    return (
        'You are an experienced software engineer.\n\n'
        'ENVIRONMENT SETUP:\n'
        '- You are running inside a Docker container with root permissions.\n'
        f'{path_hint}'
        '- You have access to Read, Write, and Bash tools.\n\n'
        f'YOUR TASK:\n{task}\n\n'
        + _PROMPT_TIMEOUT_DOCKER.format(timeout_ms=timeout_ms)
        + 'IMPORTANT GUIDELINES:\n'
        '1. First, explore the current directory structure\n'
        '2. Navigate to the artifact repository root directory\n'
        "3. If you see 'sudo' in instructions, remove it (you already have root access)\n"
        '4. Do NOT attempt to switch git branches\n'
        '5. Follow the README instructions step by step\n'
        f'6. {_PROMPT_VERIFY_STEPS}'
        '7. Use the Bash, Read, and Write tools to complete the task\n'
        '8. Work systematically through setup, build, and experiment execution\n'
        '9. If you encounter errors, debug and resolve them'
    )
+ enable_skill: If True, enable Claude Agent SDK Skill (load from ~/.claude/skills/). + enable_subagent: If True, enable Claude Agent SDK Sub-agent (Task tool). + + Returns: + dict with keys: exit_code (int), output (str), message_count (int) + """ + if not CLAUDE_SDK_AVAILABLE: + raise RuntimeError('claude_agent_sdk is not available. Install with: pip install claude-agent-sdk') + + timeout_ms = resolve_timeout_ms(timeout_ms) + if system_prompt is None: + system_prompt = build_system_prompt(task, env=env, artifact_path=artifact_path, timeout_ms=timeout_ms) + + allowed_tools = ['Read', 'Write', 'Bash'] + if enable_skill: + allowed_tools.append('Skill') + if enable_subagent: + allowed_tools.append('Task') + setting_sources = ['user', 'project'] if enable_skill else ['user'] + + options = ClaudeAgentOptions( + model=model_name, + system_prompt=system_prompt, + allowed_tools=allowed_tools, + setting_sources=setting_sources, + ) + + initial_prompt = ( + f'Please start the artifact task. Begin by changing to the artifact ' + f'directory at {artifact_path} and examining its contents.' + if artifact_path + else 'Please start working on the artifact task. Begin by examining ' + 'the current directory and finding the artifact repository.' 
+ ) + + print(f'\n{"=" * 60}', flush=True) + print(f'Starting Claude Agent SDK with model: {model_name}', flush=True) + print(f'{"=" * 60}\n', flush=True) + + message_count = 0 + result_text = '' + + if interactive: + if ClaudeSDKClient is None: + raise RuntimeError('ClaudeSDKClient not available; cannot run interactive mode.') + async with ClaudeSDKClient(options=options) as client: + await client.query(initial_prompt) + async for message in client.receive_response(): + message_count, result_text = _process_message(message, message_count, result_text) + + print(f'\nInitial task done ({message_count} messages).', flush=True) + print('\n' + '=' * 60, flush=True) + print( + "Interactive mode — type instructions (or 'quit'/'exit' to end).", + flush=True, + ) + print('=' * 60 + '\n', flush=True) + + while True: + try: + user_input = input('\n>>> ').strip() + except (EOFError, KeyboardInterrupt): + print('\nExiting interactive mode.', flush=True) + break + if not user_input: + continue + if user_input.lower() in ('quit', 'exit', 'q'): + print('Exiting interactive mode.', flush=True) + break + await client.query(user_input) + async for msg in client.receive_response(): + message_count, result_text = _process_message(msg, message_count, result_text) + + return { + 'exit_code': 0 if message_count > 0 else 1, + 'output': result_text, + 'message_count': message_count, + } + + # Non-interactive with rate-limit retry + last_exception = None + for attempt in range(1, _RATE_LIMIT_MAX_RETRIES + 1): + try: + result_text = '' + message_count = 0 + async for message in query(prompt=initial_prompt, options=options): + message_count, result_text = _process_message(message, message_count, result_text) + + print(f'Completed. 
Total messages: {message_count}', flush=True) + return { + 'exit_code': 0, + 'output': result_text, + 'message_count': message_count, + } + + except asyncio.TimeoutError as e: + logger.error('Timed out: %s', e) + return { + 'exit_code': 1, + 'output': f'Timeout: {e}', + 'message_count': message_count, + } + except Exception as e: + last_exception = e + explicit = _is_rate_limit_error(e) + wrapped = _is_possible_wrapped_rate_limit(e) and not explicit + max_r = _RATE_LIMIT_MAX_RETRIES if explicit else _RATE_LIMIT_WRAPPED_MAX_RETRIES + if (explicit or wrapped) and attempt < max_r: + parsed = _parse_retry_after(e) + wait = ( + min(parsed, _RATE_LIMIT_WAIT_MAX_SEC) + if parsed + else min( + _RATE_LIMIT_WAIT_SEC * (2 ** (attempt - 1)), + _RATE_LIMIT_WAIT_MAX_SEC, + ) + ) + logger.warning( + 'Rate limit. Waiting %ds (attempt %d/%d)...', + wait, + attempt, + max_r, + ) + await asyncio.sleep(wait) + continue + logger.error('%s', e, exc_info=True) + return { + 'exit_code': 1, + 'output': f'Error: {e}', + 'message_count': message_count, + } + + return { + 'exit_code': 1, + 'output': f'Failed after {_RATE_LIMIT_MAX_RETRIES} attempts: {last_exception}', + 'message_count': 0, + } + + +# --------------------------------------------------------------------------- +# Standalone entry point (Docker container via runner.sh) +# --------------------------------------------------------------------------- + + +def _ensure_api_key() -> None: + """Ensure at least one API key is set; exit with error otherwise.""" + if has_api_key(): + return + logger.error('API key not set. Set ANTHROPIC_API_KEY or ANTHROPIC_FOUNDRY_API_KEY.') + sys.exit(1) + + +_INTERACTIVE_SYSTEM_PROMPT = """\ +You are an experienced software engineer in an interactive session. + +ENVIRONMENT: +- You are inside a Docker container with root permissions. +- The artifact repository is at /repo. Change to it: cd /repo +- You have access to Read, Write, and Bash tools. 
+ +TIMEOUT: Long-running commands can take hours; do not set short timeouts. + +You will receive follow-up instructions from the user. Complete each one and respond. +If the user asks to stop or says 'quit'/'exit', acknowledge and they will end the session.""" + +# When running under arteval_bench, artifact is always at /repo +_ARTIFACT_PATH_IN_CONTAINER = '/repo' + + +def docker_main(): + """Standalone entry point for running inside a Docker container via runner.sh.""" + raw_args = sys.argv[1:] + interactive = '--interactive' in raw_args + args = [a for a in raw_args if a != '--interactive'] + + enable_skill = _parse_env_bool('AE_ENABLE_SKILL', False) + enable_subagent = _parse_env_bool('AE_ENABLE_SUBAGENT', False) + + # Mode 1 — interactive-only (no task): runner.py --interactive [model] + if interactive and len(args) <= 1: + model = args[0] if args else os.environ.get('AE_AGENT_MODEL', DEFAULT_MODEL) + _ensure_api_key() + result = asyncio.run( + run_agent( + model, + 'Please confirm you are in /repo and ready for follow-up instructions. 
Reply briefly.', + system_prompt=_INTERACTIVE_SYSTEM_PROMPT, + interactive=True, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + ) + sys.exit(result['exit_code']) + + # Mode 2 — task execution: runner.py [--interactive] + if len(args) != 2: + print( + 'Usage: python3 runner.py [--interactive]\n' + ' python3 runner.py --interactive [model]', + file=sys.stderr, + ) + sys.exit(1) + + model_name = args[0] + task_arg = args[1] + if os.path.isfile(task_arg): + with open(task_arg, encoding='utf-8') as f: + task = f.read() + else: + task = task_arg + + _ensure_api_key() + + try: + raw = os.environ.get('BASH_MAX_TIMEOUT_MS') + timeout_ms = int(raw) if raw else None + except ValueError: + timeout_ms = None + timeout_ms = resolve_timeout_ms(timeout_ms) + + # In container (arteval_bench): artifact is at /repo + artifact_path = _ARTIFACT_PATH_IN_CONTAINER if os.path.isdir(_ARTIFACT_PATH_IN_CONTAINER) else None + + try: + if interactive: + result = asyncio.run( + run_agent( + model_name, + task, + env='docker', + artifact_path=artifact_path, + timeout_ms=timeout_ms, + interactive=True, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + ) + else: + result = asyncio.run( + asyncio.wait_for( + run_agent( + model_name, + task, + env='docker', + artifact_path=artifact_path, + timeout_ms=timeout_ms, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ), + timeout=timeout_ms / 1000.0, + ) + ) + sys.exit(result['exit_code']) + except asyncio.TimeoutError: + logger.error('Agent exceeded timeout.') + sys.exit(1) + except Exception as e: + logger.error('%s', e, exc_info=True) + sys.exit(1) + + +if __name__ == '__main__': + docker_main() diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/runner.sh b/benchmarks/arteval_bench/src/agents/ae_agent/runner.sh new file mode 100644 index 00000000..adf9bc69 --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/runner.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Do not use set -e; some 
commands may return non-zero without indicating failure + +# Set the model and task as parameters (task can be text or path to file, e.g. /agent/current_task.txt) +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo "Example: $0 claude-sonnet-4-5-20250929 \"Install and run tests\"" + echo " $0 claude-sonnet-4-5-20250929 /agent/current_task.txt" + exit 1 +fi + +# Disable Python buffering for real-time log output +export PYTHONUNBUFFERED=1 + +# Claude Agent SDK Bash timeout: use env if set, else default 96h (must match Python utils.DEFAULT_TIMEOUT_MS = 345_600_000) +if [ -z "$BASH_MAX_TIMEOUT_MS" ]; then + export BASH_MAX_TIMEOUT_MS=345600000 +fi +if [ -z "$BASH_DEFAULT_TIMEOUT_MS" ]; then + export BASH_DEFAULT_TIMEOUT_MS="$BASH_MAX_TIMEOUT_MS" +fi + +# Invoke Python runner (-u for unbuffered output) +python3 -u /agent/runner.py "$1" "$2" diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/utils.py b/benchmarks/arteval_bench/src/agents/ae_agent/utils.py new file mode 100644 index 00000000..89497b2f --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/utils.py @@ -0,0 +1,451 @@ +"""Helper methods for running artifact tasks.""" + +from __future__ import annotations + +import json +import os +import re +import subprocess + +__all__ = [ + 'AGENT_SUMMARY_FALLBACK_MAX', + 'DEFAULT_DOCKER_IMAGE', + 'DEFAULT_MODEL', + 'DEFAULT_TIMEOUT_MS', + 'LOG_OUTPUT_TRUNCATE_BYTES', + 'SUMMARY_BASENAME_TEMPLATE', + 'SUMMARY_INSTRUCTION', + 'Tee', + 'apply_timeout_env', + 'clone_artifact_repo', + 'compute_and_write_summary', + 'parse_artifact_url', + 'docker_image_from_item', + 'env_from_item', + 'get_task', + 'gpu_from_item', + 'has_api_key', + 'interactive_from_item', + 'enable_skill_from_item', + 'enable_subagent_from_item', + 'is_local_env', + 'parse_eval_score', + 'read_task_from_file', + 'resolve_project_path', + 'resolve_timeout_ms', + 'safe_task_id', + 'status_from_exit_code', + 'timeout_env_dict', + 'timeout_ms_from_item', + 'write_task_report', +] + +# Default 
total timeout in milliseconds (96h); used by run_eval and runner. +# Single source: runner.py fallback and runner.sh (345600000) must match when utils is unavailable. +DEFAULT_TIMEOUT_MS = 345_600_000 + +# Default Docker image and model when not specified. +DEFAULT_DOCKER_IMAGE = 'bastoica/ae-agent-ubuntu24.04:latest' +DEFAULT_MODEL = 'claude-sonnet-4-5-20250929' + +# File naming templates for reports and summaries. +SUMMARY_BASENAME_TEMPLATE = 'ae_summary_{safe_id}.md' +SUMMARY_INSTRUCTION = ( + '\n\nAt the end, write a brief summary of what you did and the result to ' + '{basename} in the artifact root (so it can be included in the report).' +) +LOG_OUTPUT_TRUNCATE_BYTES = 50000 +AGENT_SUMMARY_FALLBACK_MAX = 8000 + + +def timeout_env_dict(timeout_ms: int) -> dict[str, str]: + """Return env vars dict for Bash timeout (single source for env and settings file).""" + return { + 'BASH_MAX_TIMEOUT_MS': str(timeout_ms), + 'BASH_DEFAULT_TIMEOUT_MS': str(timeout_ms), + } + + +def apply_timeout_env(timeout_ms: int) -> None: + """Set BASH_MAX_TIMEOUT_MS and BASH_DEFAULT_TIMEOUT_MS in os.environ.""" + os.environ.update(timeout_env_dict(timeout_ms)) + + +def resolve_timeout_ms(timeout_ms: int | None) -> int: + """Return timeout_ms if set, else DEFAULT_TIMEOUT_MS. Single place for default.""" + return timeout_ms if timeout_ms is not None else DEFAULT_TIMEOUT_MS + + +def has_api_key() -> bool: + """True if at least one of ANTHROPIC_API_KEY or ANTHROPIC_FOUNDRY_API_KEY is set.""" + return bool(os.environ.get('ANTHROPIC_API_KEY') or os.environ.get('ANTHROPIC_FOUNDRY_API_KEY')) + + +def status_from_exit_code(exit_code: int) -> str: + """Map process exit code to eval status string. Non-zero (incl. 
-1 for unknown) → 'error'.""" + return 'success' if exit_code == 0 else 'error' + + +def is_local_env(env: str) -> bool: + """True if env denotes local (host) execution rather than Docker.""" + return str(env).strip().lower() == 'local' + + +def _parse_bool_value(v, default: bool = False) -> bool: + """Parse a value (bool, str, or other) to bool. Strings 'true', '1', 'yes' → True.""" + if isinstance(v, bool): + return v + if isinstance(v, str): + return v.strip().lower() in ('true', '1', 'yes') + return bool(v) + + +# Default task template when artifact_readme is not specified. +_DEFAULT_TASK_TEMPLATE = ( + 'You are an experienced software engineer.' + ' You are asked to navigate to the {file_path} and follow step-by-step' + ' instructions to set up, install, compile, and reproduce the results in' + ' that code repository. You have root access inside a Docker image, which' + ' means you can directly proceed with executing the steps in the README' + ' without asking for approval or confirmation. Once you reached the end' + ' of the README you must exit the Docker image gracefully.' 
+) + + +def interactive_from_item(item: dict) -> bool: + """Whether to enable interactive mode (user can continue giving agent instructions after task completes).""" + return _parse_bool_value(item.get('interactive', False)) + + +def enable_skill_from_item(item: dict, default: bool = False) -> bool: + """Whether to enable Claude Agent SDK Skill (load from ~/.claude/skills/ and .claude/skills/).""" + return _parse_bool_value(item.get('enable_skill', default)) + + +def enable_subagent_from_item(item: dict, default: bool = False) -> bool: + """Whether to enable Claude Agent SDK Sub-agent (Task tool).""" + return _parse_bool_value(item.get('enable_subagent', default)) + + +def safe_task_id(task_id: str | None, fallback: str = 'unknown') -> str: + """Normalize task_id for use in filenames (no spaces, lowercase).""" + return (task_id or fallback).replace(' ', '_').lower() + + +def timeout_ms_from_item(item: dict) -> int | None: + """Parse timeout from task item. Returns ms (int) or None for default. + + Accepts either ``timeout_sec`` (seconds, preferred) or ``timeout_ms`` + (milliseconds). Falls back to the legacy ``timeout`` field, which is + treated as seconds if < 86_400 (24 hours), otherwise milliseconds. + """ + if 'timeout_sec' in item: + v = item['timeout_sec'] + if isinstance(v, (int, float)): + return int(v * 1000) + return None + if 'timeout_ms' in item: + v = item['timeout_ms'] + if isinstance(v, (int, float)): + return int(v) + return None + v = item.get('timeout', None) + if v is None: + return None + if isinstance(v, (int, float)): + # Legacy heuristic: 86400 = 24h in seconds; values below are treated as + # seconds, else as milliseconds (e.g. 345600000 = 96h). + return int(v * 1000) if v < 86_400 else int(v) + return None + + +def env_from_item(item: dict) -> str: + """Resolve env from task item: 'local' = host, else = docker. 
Backward compat: run_on_host/docker_env.""" + env = item.get('env', None) + if env is not None: + s = str(env).strip().lower() + return 'local' if s == 'local' else (str(env).strip() or 'docker') + return 'local' if item.get('run_on_host', False) else 'docker' + + +def gpu_from_item(item: dict) -> bool: + """Whether to enable GPU access in Docker. Default False (no host GPU passed to container).""" + return _parse_bool_value(item.get('gpu', False)) + + +def docker_image_from_item( + item: dict, + default: str | None = None, + *, + env: str | None = None, +) -> str | None: + """Resolve Docker image from task item. + + When env is 'local', returns None (no Docker). Otherwise returns, in order: + item['env'] if it looks like an image name, item['docker_env'], or default. + If env is provided (e.g. from env_from_item), avoids parsing env twice. + """ + resolved = (env if env is not None else env_from_item(item)).strip().lower() + if resolved == 'local': + return None + env_val = item.get('env', None) + if env_val is not None: + s = str(env_val).strip() + if s and s.lower() != 'local': + return s + return ( + item.get('docker_env', None) + or item.get('docer_env', None) + or (default or DEFAULT_DOCKER_IMAGE) + ) + + +def get_task(file_path: str) -> str: + """Get agent task from a file path. + + Args: + file_path: Path to README or task description file (relative to artifact root) + + Returns: + Task description string for the agent + """ + return _DEFAULT_TASK_TEMPLATE.format(file_path=file_path) + + +def read_task_from_file(artifact_path: str, task_file: str) -> str: + """Read task description from a file. 
+ + Args: + artifact_path: Path to artifact root directory + task_file: Relative path to task file (e.g., README.md) + + Returns: + Content of the task file as string + """ + task_file_path = os.path.join(artifact_path, task_file) + if os.path.exists(task_file_path): + with open(task_file_path, encoding='utf-8') as f: + return f.read() + else: + return get_task(task_file) + + +def parse_artifact_url(artifact_url: str) -> tuple[str, str | None]: + """Parse artifact URL into (clone_url, branch) for git clone. + + Supports GitHub-style URLs: + - https://github.com/org/repo -> (https://github.com/org/repo.git, None) + - https://github.com/org/repo/tree/branch -> (https://github.com/org/repo.git, branch) + """ + url = (artifact_url or '').strip() + if not url: + return url, None + # .../tree/ or .../tree// + tree_match = re.search(r'^(.*?)/tree/([^/#]+?)/?$', url) + if tree_match: + base, branch = tree_match.group(1), tree_match.group(2).strip() + if not base.endswith('.git'): + base = base.rstrip('/') + '.git' + return base, branch if branch else None + if not url.endswith('.git'): + url = url.rstrip('/') + '.git' + return url, None + + +def clone_artifact_repo(artifact_url: str, target_dir: str, branch: str | None = None) -> str: + """Clone artifact repository from URL into target_dir. + + Args: + artifact_url: Git clone URL (e.g. https://github.com/org/repo or .../repo/tree/branch). + target_dir: Absolute path to the directory to clone into (must not exist or be empty). + branch: Optional branch to clone. If None, parse_artifact_url(artifact_url) is used. + + Returns: + target_dir (artifact root path after clone). + + Raises: + RuntimeError: If git clone fails. 
+ """ + if os.path.exists(target_dir) and os.listdir(target_dir): + return target_dir + if os.path.exists(target_dir): + os.rmdir(target_dir) + clone_url, parsed_branch = parse_artifact_url(artifact_url) + use_branch = branch if branch is not None else parsed_branch + cmd = ['git', 'clone', '--depth', '1'] + if use_branch: + cmd.extend(['-b', use_branch]) + cmd.extend([clone_url, target_dir]) + r = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=600, + ) + if r.returncode != 0: + raise RuntimeError(f'git clone failed: {r.stderr or r.stdout}') + return target_dir + + +def resolve_project_path(item: dict, input_file: str, save_path: str) -> tuple[str | None, str | None]: + """Resolve artifact project path from task item. + + When both artifact_url and artifact_dir are set, if the local path + (input_dir/artifact_dir) already exists, it is used and no clone is performed. + Otherwise the repo is cloned from artifact_url into save_path/workspace/. + + Returns: + (project_path, error_message). If error_message is not None, skip task. + """ + input_dir = os.path.dirname(os.path.abspath(input_file)) + artifact_dir = item.get('artifact_dir') + artifact_url = item.get('artifact_url') + task_id = item.get('artifact_id') + sid = safe_task_id(task_id) + + if artifact_url: + candidate = os.path.join(input_dir, artifact_dir) if artifact_dir else None + if candidate and os.path.isdir(candidate): + return os.path.abspath(candidate), None + workspace_dir = os.path.join(save_path, 'workspace', sid) + os.makedirs(os.path.dirname(workspace_dir), exist_ok=True) + return clone_artifact_repo(artifact_url, workspace_dir), None + if not artifact_dir: + return None, f'Skipping task {task_id}: missing artifact_dir and artifact_url' + path = os.path.abspath(os.path.join(input_dir, artifact_dir)) + if not os.path.isdir(path): + return None, f'Project path does not exist: {path}' + return path, None + + +class Tee: + """Write to both an original stream and a log file. 
+ + Implements enough of the TextIO interface to serve as a drop-in + replacement for sys.stdout / sys.stderr (supports libraries that + probe encoding, isatty, etc.). + """ + + def __init__(self, stream, log_path: str): + """Wrap stream and log_path for dual write.""" + self._stream = stream + self._path = log_path + self._file = None + + def __enter__(self): + """Open log file and return self.""" + self._file = open(self._path, 'a', encoding='utf-8') + return self + + def __exit__(self, *args): + """Close log file.""" + if self._file: + self._file.close() + + def write(self, data): + """Write to both stream and log file.""" + self._stream.write(data) + if self._file: + self._file.write(data) + self._file.flush() + + def flush(self): + """Flush both stream and log file.""" + self._stream.flush() + if self._file: + self._file.flush() + + @property + def encoding(self) -> str: + """Return underlying stream encoding or utf-8.""" + return getattr(self._stream, 'encoding', 'utf-8') + + def isatty(self) -> bool: + """Return whether underlying stream is a TTY.""" + return getattr(self._stream, 'isatty', lambda: False)() + + def fileno(self) -> int: + """Return underlying stream fileno.""" + return self._stream.fileno() + + +def write_task_report( + save_path: str, + safe_id: str, + task_id: str, + result: dict, + log_path: str, + agent_summary: str, +) -> None: + """Write ae_report_.md for a single task.""" + report_path = os.path.join(save_path, f'ae_report_{safe_id}.md') + saved_image = result.get('saved_image') + with open(report_path, 'w', encoding='utf-8') as fw: + fw.write(f'# AE Report: {task_id}\n\n') + fw.write(f'- **Status**: {result.get("status", "unknown")}\n') + fw.write(f'- **Timestamp**: {result.get("timestamp", "")}\n') + fw.write(f'- **Project path**: {result.get("project_path", "")}\n') + fw.write(f'- **Run on host**: {result.get("run_on_host", False)}\n') + fw.write(f'- **Log file**: `{log_path}`\n\n') + if saved_image: + fw.write('> [!Note]\n') + 
fw.write('> ## To check the result\n') + fw.write('>\n') + fw.write('> You can run the following command to manually check the result:\n') + fw.write('>\n') + fw.write('> ```bash\n') + fw.write(f'> docker run -it {saved_image} bash\n') + fw.write('> ```\n') + fw.write('>\n') + fw.write(f'> Image: `{saved_image}`\n\n') + fw.write('## Agent summary\n\n') + fw.write(agent_summary) + fw.write('\n') + + +def parse_eval_score(output) -> int: + """Parse evaluation score from evaluator script output (string or object with .output). + + - If a line is a single digit (e.g. '4', '0'), use it (prefer last such line). + - If output contains 'Agent scores: {...}' (Oracle-style evaluator), count ': 1' as passed items. + - Otherwise return 0. + """ + s = (getattr(output, 'output', None) or str(output) or '').strip() + if not s: + return 0 + lines = s.splitlines() + for line in reversed(lines): + t = line.strip() + if t.isdigit(): + return int(t) + m = re.search(r'Agent scores:\s*\{[^}]*\}', s) + if m: + return m.group(0).count(': 1') + return 0 + + +def compute_and_write_summary(save_path: str) -> tuple[int, int]: + """Read result.jsonl, compute total/success, write summary.json. + + total = number of result lines (success + error + skipped). success = status == "success". + Returns (total_count, success_count). 
+ """ + result_path = os.path.join(save_path, 'result.jsonl') + total, success = 0, 0 + if os.path.isfile(result_path): + with open(result_path, encoding='utf-8') as f: + for line in f: + if not line.strip(): + continue + try: + row = json.loads(line.strip()) + total += 1 + if row.get('status') == 'success': + success += 1 + except json.JSONDecodeError: + continue + rate = success / total if total > 0 else 0.0 + summary = {'total_tasks': total, 'successful_tasks': success, 'success_rate': rate} + with open(os.path.join(save_path, 'summary.json'), 'w', encoding='utf-8') as f: + json.dump(summary, f, indent=4) + return total, success diff --git a/benchmarks/arteval_bench/src/main.py b/benchmarks/arteval_bench/src/main.py new file mode 100644 index 00000000..75222211 --- /dev/null +++ b/benchmarks/arteval_bench/src/main.py @@ -0,0 +1,246 @@ +"""This script runs a benchmark for evaluating patches in a software project.""" + +import argparse +import json +import os +import sys +from datetime import datetime + +_src_dir = os.path.dirname(os.path.abspath(__file__)) +if _src_dir not in sys.path: + sys.path.insert(0, _src_dir) +sys.path.append(os.path.abspath(os.path.join(_src_dir, '../../../'))) + +from sdk.logger import logger +from sdk.utils import set_llm_endpoint_from_config + +set_llm_endpoint_from_config('env.toml') + +from run_eval_in_env import run_eval +from utils import get_task + +from agents.ae_agent.utils import ( + enable_skill_from_item, + enable_subagent_from_item, + gpu_from_item, + interactive_from_item, + resolve_project_path, + safe_task_id, + timeout_ms_from_item, + write_task_report, + compute_and_write_summary, +) + + +def _persist_skipped(save_path: str, task_id: str, message: str, expected_score: int = -1) -> None: + """Append one result line for a skipped task so summary total is accurate (same as ae-agent).""" + result = { + 'task_id': task_id, + 'status': 'skipped', + 'message': message, + 'expected_score': expected_score, + } + with 
open(os.path.join(save_path, 'result.jsonl'), 'a+', encoding='utf-8') as fw: + fw.write(json.dumps(result, ensure_ascii=False) + '\n') + + +def _parse_bool(v, default=False): + if isinstance(v, bool): + return v + if isinstance(v, str): + return v.strip().lower() in ('true', '1', 'yes') + return bool(v) if v is not None else default + + +def _is_ae_agent(agent): + """True if agent path points to the ae_agent (for report/summary writing).""" + if not agent: + return False + return 'ae_agent' in agent or os.path.basename(agent) == 'ae_agent' + + +def main(file_path, model, agent, save_path, interactive_default=False, enable_skill_default=False, enable_subagent_default=False): + """Main function for running the benchmark.""" + logger.info(f'Using model: {model}, agent: {agent}') + with open(file_path) as f: + for line in f: + if not line.strip(): + continue # Skip empty lines + + try: + item = json.loads(line) + except json.JSONDecodeError: + logger.info(f'Skipping invalid JSON line: {line}') + continue + + env_val = item.get('env', None) + if env_val is not None: + s = str(env_val).strip().lower() + if s == 'local': + run_on_host = True + deployment = None + else: + run_on_host = False + deployment = str(env_val).strip() or None + else: + deployment = item.get('docker_env', None) or item.get('docer_env', None) + run_on_host = item.get('run_on_host', False) + task_id = item.get('artifact_id', None) + project_path, path_error = resolve_project_path(item, file_path, save_path) + if path_error: + logger.info(f"Task {task_id}: {path_error}") + _persist_skipped( + save_path, + task_id or safe_task_id(task_id), + path_error, + item.get('expected_score', -1), + ) + continue + task_file = item.get('artifact_readme', None) + test_method = item.get('evaluator', None) + + timeout_ms = timeout_ms_from_item(item) + gpu = gpu_from_item(item) + interactive = interactive_from_item(item) or interactive_default + enable_skill = enable_skill_from_item(item, enable_skill_default) + 
enable_subagent = enable_subagent_from_item(item, enable_subagent_default) + keep_container = _parse_bool(item.get('keep_container'), False) + + task = get_task(task_file) + + logger.info( + f"Task {task_id}: project_path={project_path}, run_on_host={run_on_host}, " + f"timeout_ms={timeout_ms}, gpu={gpu}, interactive={interactive}, " + f"enable_skill={enable_skill}, enable_subagent={enable_subagent}, keep_container={keep_container}" + ) + + result = run_eval( + deployment=deployment, + project_path=project_path, + task_id=task_id, + task=task, + model=model, + agent_path=agent, + test_method=test_method, + save_path=save_path, + run_on_host=run_on_host, + timeout_ms=timeout_ms, + gpu=gpu, + interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + keep_container=keep_container, + ) + + result['expected_score'] = item.get('expected_score', -1) + result['timestamp'] = result.get('timestamp') or datetime.now().isoformat() + with open(f'{save_path}/result.jsonl', 'a+', encoding='utf-8') as fw: + fw.write(json.dumps(result, ensure_ascii=False) + '\n') + + # When using ae_agent, also write per-task AE report (same as standalone ae-agent). + if _is_ae_agent(agent): + safe_id = safe_task_id(task_id) + log_path = result.get('log_file') or '(log not captured when run via benchmark)' + agent_summary = (result.get('agent_run_results') or '')[:8000] or '(No summary captured)' + try: + write_task_report(save_path, safe_id, task_id, result, log_path, agent_summary) + except Exception as e: + logger.warning('write_task_report failed: %s', e) + + # Write summary.json (total/success counts) when ae_agent was used. 
+ if _is_ae_agent(agent): + try: + compute_and_write_summary(save_path) + except Exception as e: + logger.warning('compute_and_write_summary failed: %s', e) + + success_count = 0 + total_count = 0 + with open(f'{save_path}/result.jsonl', encoding='utf-8') as f: + for line in f: + result = json.loads(line.strip()) + if result.get('status') == 'success': + success_count += (result.get('score') == result.get('expected_score', -1)) + total_count += 1 + logger.info(f'Test run completed: {success_count}/{total_count} tasks succeeded.') + summary_data = {'final_score': success_count / total_count, 'total_tasks': total_count} + + with open(os.path.join(save_path, 'avg_score.json'), 'w', encoding='utf-8') as summary_file: + json.dump(summary_data, summary_file, indent=4) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='example benchmark') + parser.add_argument( + '-i', + '--input_file', + help='Benchmark input file', + default='./data/benchmark/arteval_tasks.jsonl', + #default='./data/benchmark/env_setup_examples.jsonl', + ) + parser.add_argument('-o', '--save_path', help='Result save path', default=None) + parser.add_argument( + '-a', + '--agent', + help='Agent Name', + default='claudecode', + ) + parser.add_argument( + '-m', + '--model_name', + help='Model Name', + default='claude-sonnet-4-5-20250929', + ) + parser.add_argument( + '--interactive', + action='store_true', + help='Enable interactive mode (continue giving agent instructions after task completes)', + ) + parser.add_argument( + '--enable-skill', + action='store_true', + help='Enable Claude Agent SDK Skill (load from ~/.claude/skills/)', + ) + parser.add_argument( + '--enable-subagent', + action='store_true', + help='Enable Claude Agent SDK Sub-agent (Task tool)', + ) + # Note that if your benchmark has multiple tasks, you need to add --task + # in your code to enable task selection. 
+ parser.add_argument('-t', '--task', help='specify task in scenarios', default=None) + + args = parser.parse_args() + + model_name = args.model_name + agent = args.agent + input_file = args.input_file + save_path = args.save_path + task = args.task + + logger.debug(f"Benchmark path: {input_file}") + + if save_path is None: + str_model_name = model_name.replace('/', '_') + timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + save_path = os.path.join('./outputs', f'env_setup_project__{str_model_name}__{args.agent}__{timestamp}') + + _src_dir = os.path.dirname(os.path.abspath(__file__)) + if agent == 'claudecode': + agent = os.path.join(_src_dir, 'agents', 'claudecode') + elif agent == 'claude_sdk': + agent = os.path.join(_src_dir, 'agents', 'claude_sdk') + elif agent == 'ae_agent' or agent == 'ae-agent': + agent = os.path.join(_src_dir, 'agents', 'ae_agent') + save_path = os.path.abspath(os.path.expanduser(save_path)) + os.makedirs(save_path, exist_ok=True) + + main( + input_file, + model_name, + agent, + save_path, + interactive_default=getattr(args, 'interactive', False), + enable_skill_default=getattr(args, 'enable_skill', False), + enable_subagent_default=getattr(args, 'enable_subagent', False), + ) diff --git a/benchmarks/arteval_bench/src/run_eval_in_env.py b/benchmarks/arteval_bench/src/run_eval_in_env.py new file mode 100644 index 00000000..afc5a22b --- /dev/null +++ b/benchmarks/arteval_bench/src/run_eval_in_env.py @@ -0,0 +1,845 @@ +"""Patch evaluator for running tests in a deployment.""" + +import asyncio +import json +import os +import re +import subprocess +import sys +import tempfile +import shutil +from pathlib import Path + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../'))) + +from swerex.deployment.docker import DockerDeploymentConfig +from swerex.runtime.abstract import BashAction, Command, CreateBashSessionRequest, UploadRequest + +from sdk.logger import logger + + +def _parse_eval_score(output) -> int: + 
"""Parse evaluation score from BashObservation or string output. + + - If a line is a single digit (e.g. '4', '0'), use it (prefer last such line). + - If output contains 'Agent scores: {...}' (Oracle-style evaluator), count ': 1' as passed items. + - Otherwise return 0. + """ + s = (getattr(output, "output", None) or str(output) or "").strip() + if not s: + return 0 + lines = s.splitlines() + for line in reversed(lines): + t = line.strip() + if t.isdigit(): + return int(t) + m = re.search(r"Agent scores:\s*\{[^}]*\}", s) + if m: + return m.group(0).count(": 1") + return 0 + + +def write_to_file(file_path, content): + """Write content to a file.""" + with open(file_path, 'w') as f: + f.write(content) + + +def setup_claude_settings_on_host(): + """Set up ~/.claude/settings.json with timeout configuration on host.""" + claude_dir = Path.home() / ".claude" + settings_file = claude_dir / "settings.json" + + claude_dir.mkdir(exist_ok=True) + + settings = { + "env": { + "BASH_MAX_TIMEOUT_MS": "345600000", # 96 hours + "BASH_DEFAULT_TIMEOUT_MS": "345600000" + } + } + + with open(settings_file, 'w') as f: + json.dump(settings, f, indent=2) + + logger.info(f"Created {settings_file} with 96-hour timeout configuration.") + + +def _is_ae_agent_path(agent_path) -> bool: + """True if agent_path points to the ae_agent agent (same flow: agent + evaluation script).""" + if not agent_path: + return False + p = (agent_path or "").rstrip(os.sep) + return p.endswith("ae_agent") or os.path.basename(p) == "ae_agent" + + +def _stdin_is_tty() -> bool: + """True if stdin is a TTY (required for docker exec -it).""" + return getattr(sys.stdin, "isatty", lambda: False)() + + +async def _get_container_id_from_runtime(runtime, deployment) -> str: + """Get Docker container ID from inside the container (hostname/cgroup) or from deployment.""" + container_id = "unknown" + try: + res = await runtime.run_in_session( + BashAction(command='cat /etc/hostname 2>/dev/null || hostname 2>/dev/null || echo 
"unknown"', timeout=10.0) + ) + container_id = str(getattr(res, "output", "")).strip() + try: + cgroup_res = await runtime.run_in_session( + BashAction(command='cat /proc/self/cgroup 2>/dev/null | grep docker | head -1 | cut -d/ -f3 | cut -c1-12 || echo ""', timeout=10.0) + ) + cid = str(getattr(cgroup_res, "output", "")).strip() + if cid: + container_id = cid + except Exception: + pass + if hasattr(deployment, '_container_id') and getattr(deployment, '_container_id', None): + container_id = deployment._container_id + elif hasattr(deployment, 'container_id') and getattr(deployment, 'container_id', None): + container_id = deployment.container_id + except Exception as e: + logger.warning('Failed to get container ID: %s', e) + return container_id + + +async def _run_ae_agent_interactive_foreground( + container_id: str, + model: str, + timeout_ms: int | None, + enable_skill: bool, + enable_subagent: bool, +): + """Run ae_agent runner in foreground via docker exec -it (interactive mode). Returns MockResult with exit_code.""" + try: + from agents.ae_agent.utils import resolve_timeout_ms + from agents.ae_agent.run_eval import _docker_exec_env_args + except ImportError: + _src = os.path.dirname(os.path.abspath(__file__)) + if _src not in sys.path: + sys.path.insert(0, _src) + from agents.ae_agent.utils import resolve_timeout_ms + from agents.ae_agent.run_eval import _docker_exec_env_args + + timeout_resolved = resolve_timeout_ms(timeout_ms) + exec_env = _docker_exec_env_args( + timeout_resolved, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + exec_args = ( + ['docker', 'exec', '-it'] + + exec_env + + [container_id, 'python3', '-u', '/agent/runner.py', model, '/agent/current_task.txt', '--interactive'] + ) + logger.info('Running ae_agent in interactive mode (foreground): docker exec -it %s ...', container_id[:12]) + proc = await asyncio.to_thread( + subprocess.run, + exec_args, + stdin=sys.stdin, + stdout=sys.stdout, + stderr=sys.stderr, + ) + 
exit_code = proc.returncode if proc else -1 + + class MockResult: + def __init__(self, code, output=''): + self.exit_code = code + self.output = output or f'exit_code={code}' + + return MockResult(exit_code, f'Interactive session (exit_code={exit_code})') + + +async def run_eval_on_host( + project_path, + task_id, + task, + model, + agent_path, + test_method, + save_path, + timeout_ms=None, + interactive=False, + enable_skill=False, + enable_subagent=False, +): + """Run evaluation directly on host machine (no Docker container). + + When agent is ae_agent, delegates to ae_agent.run_agent_then_eval (agent run + evaluation script), + same flow as claude_sdk. Otherwise uses inline Claude SDK + test_method. + """ + logger.info("=" * 80) + logger.info("Running evaluation directly on HOST MACHINE (not in Docker)") + logger.info("=" * 80) + + if _is_ae_agent_path(agent_path): + logger.info("Using ae_agent flow: run agent then evaluation script.") + try: + from agents.ae_agent.run_eval import _run_agent_then_eval_async + except ImportError: + _src = os.path.dirname(os.path.abspath(__file__)) + if _src not in sys.path: + sys.path.insert(0, _src) + from agents.ae_agent.run_eval import _run_agent_then_eval_async + result = await _run_agent_then_eval_async( + project_path=project_path, + task_id=task_id, + task=task, + model=model, + test_method=test_method, + save_path=save_path, + timeout_ms=timeout_ms, + skip_prereq_check=False, + interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + return result + + # Original flow: inline Claude SDK then test_method (e.g. 
claude_sdk or default) + import shutil + + if not shutil.which("docker"): + raise RuntimeError("Docker is not installed on host") + + result = subprocess.run(["docker", "ps"], capture_output=True, timeout=10) + if result.returncode != 0: + raise RuntimeError("Docker is not running on host") + + if not os.environ.get("ANTHROPIC_API_KEY"): + raise RuntimeError("ANTHROPIC_API_KEY environment variable is not set") + + setup_claude_settings_on_host() + + project_path = os.path.abspath(project_path) + if not os.path.isdir(project_path): + raise RuntimeError(f"Project path does not exist: {project_path}") + + logger.info(f"Project path: {project_path}") + logger.info(f"Task ID: {task_id}") + logger.info(f"Model: {model}") + + try: + from claude_agent_sdk import query, ClaudeAgentOptions + except ImportError as e: + raise RuntimeError(f"claude_agent_sdk not installed: {e}. Install with: pip install claude-agent-sdk") + + system_prompt = f"""You are an experienced software engineer completing an artifact evaluation task. + +ENVIRONMENT SETUP (HOST MACHINE - NOT DOCKER): +- You are running DIRECTLY on the host machine (NOT inside a Docker container) +- Docker daemon is already running on this host +- When you use Kind to create Kubernetes clusters, they will be created using the host's Docker +- This avoids Docker-in-Docker compatibility issues +- You may need sudo for some operations + +ARTIFACT LOCATION: +- The artifact repository is located at: {project_path} +- Start by changing to this directory: cd {project_path} + +YOUR TASK: +{task} + +TIMEOUT CONFIGURATION (CRITICAL): +- Long-running commands (builds, tests, Kind cluster creation) are expected +- DO NOT set short timeouts - let commands complete naturally +- Kind cluster creation can take 5-10 minutes +- Full benchmark runs can take hours + +IMPORTANT GUIDELINES: +1. First, cd to {project_path} and examine the directory structure +2. Follow the README instructions step by step +3. 
If you see 'sudo' in instructions, you can use it (or skip if already root) +4. Use the Bash tool to run commands, Read tool to inspect files +5. Work systematically through setup, build, and experiment execution +6. If you encounter errors, debug and resolve them using available tools +7. For Kind clusters, they will work properly since you're on the host (not DinD)""" + + options = ClaudeAgentOptions( + system_prompt=system_prompt, + allowed_tools=["Read", "Write", "Bash"], + setting_sources=["user"], + ) + + os.environ['BASH_MAX_TIMEOUT_MS'] = '345600000' + os.environ['BASH_DEFAULT_TIMEOUT_MS'] = '345600000' + + logger.info("Starting Claude Agent SDK (Host Mode)...") + + message_count = 0 + run_results_output = "" + + try: + async for message in query( + prompt=f"Please start the artifact evaluation task. Begin by changing to the artifact directory at {project_path} and examining its contents.", + options=options + ): + message_count += 1 + if message_count % 10 == 0: + logger.info(f"[Progress] Processed {message_count} messages...") + msg_str = str(message) + logger.info(msg_str) + if 'ResultMessage' in msg_str or 'TextBlock' in msg_str: + run_results_output = msg_str + logger.info(f"Claude Agent SDK execution completed. 
Total messages: {message_count}") + except Exception as e: + logger.error(f"Claude Agent SDK execution failed: {e}") + import traceback + traceback.print_exc() + run_results_output = f"Error: {e}" + + logger.info("Running evaluation script...") + try: + eval_cmd = f"cd {project_path} && {test_method}" + eval_result = subprocess.run( + eval_cmd, + shell=True, + capture_output=True, + text=True, + timeout=300 + ) + test_output = eval_result.stdout.strip() + logger.info(f"Evaluation output: {test_output}") + result = { + 'task_id': task_id, + 'task': task, + 'project_path': project_path, + 'agent_run_results': run_results_output, + 'test_method': test_method, + 'score': _parse_eval_score(test_output), + 'status': 'success', + 'run_on_host': True, + } + except Exception as e: + logger.error(f"Error running test method: {e}") + result = { + 'task_id': task_id, + 'task': task, + 'project_path': project_path, + 'agent_run_results': run_results_output, + 'test_method': test_method, + 'score': 0, + 'status': f'error: {str(e)}', + 'run_on_host': True, + } + + return result + + +async def run_eval_in_env( + deployment, + project_path, + task_id, + task, + model, + agent_path, + test_method, + save_path, + timeout_ms=None, + gpu=False, + interactive=False, + enable_skill=False, + enable_subagent=False, + keep_container=True, +): + """Spoiler: This function will work with any deployment.""" + await deployment.start() + runtime = deployment.runtime + + # Default 96h when timeout_ms not provided + runner_timeout_sec = (timeout_ms / 1000.0) if timeout_ms is not None else 345600.0 + if hasattr(runtime, "_config"): + logger.info(f"Current RemoteRuntime timeout: {runtime._config.timeout}s") + runtime._config.timeout = runner_timeout_sec + logger.info(f"Overriding RemoteRuntime timeout to {runtime._config.timeout}s") + + # Issue a few one-off commands, similar to `subprocess.run()` + logger.info(await runtime.execute(Command(command=['echo', 'Hello, world!']))) + + # Create a bash 
session + await runtime.create_session(CreateBashSessionRequest()) + # Run a command in the session + # The difference to the one-off commands is that environment state persists! + logger.info(await runtime.run_in_session(BashAction(command="export MYVAR='test'"))) + logger.info(await runtime.run_in_session(BashAction(command='echo $MYVAR'))) + + logger.info('Uploading project files...') + logger.info( + await runtime.upload( + UploadRequest( + source_path=project_path, + target_path='/repo', + ) + ) + ) + logger.info('Project files uploaded.') + + # Long-running agents (claude_sdk, ae_agent): remove eval script dirs so the agent cannot see evaluation logic + is_claude_sdk = str(agent_path).endswith('claude_sdk') + is_ae_agent = str(agent_path).endswith('ae_agent') + is_long_running_agent = is_claude_sdk or is_ae_agent + agent_label = 'ae_agent' if is_ae_agent else 'claude_sdk' + if is_long_running_agent: + logger.info(f'Removing _agent_eval directories for {agent_label} to prevent answer leakage...') + await runtime.run_in_session( + BashAction(command='find /repo -type d -name "_agent_eval" -exec rm -rf {} + 2>/dev/null || true', timeout=30.0) + ) + logger.info('_agent_eval directories removed.') + + run_results = await runtime.run_in_session(BashAction(command='cd /repo')) + logger.info(run_results) + run_results = await runtime.run_in_session(BashAction(command='pwd')) + logger.info(f'Current directory: {run_results}') + run_results = await runtime.run_in_session(BashAction(command='ls')) + logger.info(f'Current directory contents: {run_results}') + + logger.info('Uploading agent runner script...') + logger.info( + await runtime.upload( + UploadRequest( + source_path=agent_path, + target_path='/agent', + ) + ) + ) + logger.info(await runtime.run_in_session(BashAction(command='ls /agent/runner.sh'))) + logger.info('Agent runner script uploaded.') + + logger.info('Setup the agent running environment...') + logger.info(await 
runtime.run_in_session(BashAction(command='chmod +x /agent/runner.sh /agent/install.sh'))) + logger.info(await runtime.run_in_session(BashAction(command='cat /agent/runner.sh'))) + logger.info(await runtime.run_in_session(BashAction(command='/agent/install.sh'))) + + # Set required env vars for long-running agents (passed from host into container) + if is_long_running_agent: + parts = [] + anthropic_api_key = os.environ.get('ANTHROPIC_API_KEY') + foundry_api_key = os.environ.get('ANTHROPIC_FOUNDRY_API_KEY') + if anthropic_api_key: + escaped_key = anthropic_api_key.replace("'", "'\"'\"'") + parts.append(f"export ANTHROPIC_API_KEY='{escaped_key}'") + if foundry_api_key: + escaped_foundry = foundry_api_key.replace("'", "'\"'\"'") + parts.append(f"export ANTHROPIC_FOUNDRY_API_KEY='{escaped_foundry}'") + if not anthropic_api_key: + parts.append(f"export ANTHROPIC_API_KEY='{escaped_foundry}'") + foundry_base = os.environ.get('ANTHROPIC_FOUNDRY_BASE_URL') + if foundry_base: + escaped_url = foundry_base.replace("'", "'\"'\"'") + parts.append(f"export ANTHROPIC_FOUNDRY_BASE_URL='{escaped_url}'") + if os.environ.get('CLAUDE_CODE_USE_FOUNDRY') == '1': + parts.append("export CLAUDE_CODE_USE_FOUNDRY=1") + if enable_skill: + parts.append("export AE_ENABLE_SKILL=1") + if enable_subagent: + parts.append("export AE_ENABLE_SUBAGENT=1") + if parts: + set_env_cmd = " && ".join(parts) + logger.info('Setting Anthropic/Foundry API key and env in container...') + logger.info(await runtime.run_in_session(BashAction(command=set_env_cmd))) + if not anthropic_api_key and not foundry_api_key: + logger.warning('Neither ANTHROPIC_API_KEY nor ANTHROPIC_FOUNDRY_API_KEY found. 
Runner may fail.') + + # For ae_agent: upload task to /agent/current_task.txt to avoid shell quoting with large tasks + if is_ae_agent: + tmpdir = tempfile.mkdtemp(prefix='ae_agent_task_') + try: + task_file_host = os.path.join(tmpdir, 'current_task.txt') + with open(task_file_host, 'w', encoding='utf-8') as f: + f.write(task) + await runtime.upload(UploadRequest(source_path=tmpdir, target_path='/agent_task_file')) + await runtime.run_in_session(BashAction(command='cp /agent_task_file/current_task.txt /agent/current_task.txt', timeout=10.0)) + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + logger.info('Task file uploaded to /agent/current_task.txt for ae_agent.') + + logger.info('Running runner script...') + if timeout_ms is not None: + runner_timeout = timeout_ms / 1000.0 + else: + runner_timeout = 345600.0 if is_long_running_agent else 1200.0 # 96h for long-running agents + + run_results = None + # Docker + interactive: run ae_agent in foreground via docker exec -it (same as standalone ae-agent). 
+ if is_ae_agent and interactive and _stdin_is_tty(): + container_id_early = await _get_container_id_from_runtime(runtime, deployment) + if container_id_early and container_id_early != "unknown": + try: + run_results = await _run_ae_agent_interactive_foreground( + container_id_early, model, timeout_ms, enable_skill, enable_subagent + ) + logger.info('ae_agent interactive session finished with exit_code=%s', run_results.exit_code) + except Exception as e: + logger.warning('ae_agent interactive foreground failed: %s', e) + else: + logger.warning('Cannot get container ID for interactive mode; falling back to non-interactive.') + + if run_results is None: + if is_long_running_agent: + # Live log monitoring: run runner in background, poll log file periodically + await runtime.run_in_session(BashAction(command='rm -f /agent/runner.live.log && touch /agent/runner.live.log', timeout=10.0)) + + # ae_agent: use task file to avoid shell quoting; others pass task string + if is_ae_agent: + start_cmd = ( + 'stdbuf -oL -eL /agent/runner.sh "' + model + '" /agent/current_task.txt > /agent/runner.live.log 2>&1 & ' + 'RUNNER_PID=$!; ' + 'sleep 1; ' + 'echo RUNNER_PID=$RUNNER_PID' + ) + else: + start_cmd = ( + f'bash -c "stdbuf -oL -eL /agent/runner.sh \\"{model}\\" \\"{task}\\" > /agent/runner.live.log 2>&1 & ' + 'RUNNER_PID=$!; ' + 'sleep 1; ' + 'echo RUNNER_PID=$RUNNER_PID"' + ) + start_res = await runtime.run_in_session(BashAction(command=start_cmd, timeout=30.0)) + start_output = str(getattr(start_res, "output", "")).strip() + + pid = None + for line in start_output.split('\n'): + if 'RUNNER_PID=' in line: + pid = line.split('RUNNER_PID=', 1)[1].strip() + break + + if not pid or not pid.isdigit(): + # Fallback: find PID by process name after short delay + await asyncio.sleep(2) + ps_res = await runtime.run_in_session( + BashAction(command="ps aux | grep '[r]unner.py' | awk '{print $2}' | head -1", timeout=10.0) + ) + pid = str(getattr(ps_res, "output", "")).strip() + + 
logger.info(f'{agent_label} runner started with pid: {pid}') + + await asyncio.sleep(2) # Allow log file to have content + + elapsed = 0.0 + poll_interval = 10.0 # Poll every 10s for live log + run_results = None + last_log_content = "" # Track last read content to avoid duplicate output + + while elapsed < runner_timeout: + try: + log_res = await runtime.run_in_session( + BashAction(command='cat /agent/runner.live.log 2>/dev/null || echo ""', timeout=30.0) + ) + current_log_content = str(getattr(log_res, "output", "")).strip() + + if current_log_content and current_log_content != last_log_content: + if last_log_content and current_log_content.startswith(last_log_content): + new_content = current_log_content[len(last_log_content):].strip() + if new_content: + logger.info(f'[{agent_label} live log @ {elapsed:.0f}s ({elapsed/60:.1f} min)]\n{new_content}') + else: + logger.info(f'[{agent_label} live log @ {elapsed:.0f}s ({elapsed/60:.1f} min)]\n{current_log_content}') + last_log_content = current_log_content + elif elapsed % 300 == 0 and elapsed > 0: + logger.info(f'[{agent_label} still running @ {elapsed:.0f}s ({elapsed/60:.1f} min), no new output]') + except Exception as e: + logger.info(f'Failed to read {agent_label} live log: {e}') + + if pid and pid.isdigit(): + ps_res = await runtime.run_in_session( + BashAction(command=f'ps -p {pid} >/dev/null 2>&1; echo $?', timeout=10.0) + ) + ps_code = str(getattr(ps_res, "output", "")).strip() + if ps_code != "0": + wait_res = await runtime.run_in_session( + BashAction(command=f'wait {pid} 2>/dev/null; echo $?', timeout=30.0) + ) + exit_code_str = str(getattr(wait_res, "output", "")).strip() + + class MockResult: + def __init__(self, code): + self.exit_code = int(code) if code.isdigit() else 0 + self.output = f'exit_code={self.exit_code}' + run_results = MockResult(exit_code_str) + logger.info(f'{agent_label} runner finished with exit code: {run_results.exit_code}') + break + else: + ps_res = await runtime.run_in_session( + 
BashAction(command="ps aux | grep '[r]unner.py' | wc -l", timeout=10.0) + ) + proc_count = str(getattr(ps_res, "output", "")).strip() + if proc_count == "0" or not proc_count.isdigit() or int(proc_count) == 0: + logger.info(f'{agent_label} runner process not found, assuming finished') + class MockResult: + def __init__(self): + self.exit_code = 0 + self.output = 'exit_code=0' + run_results = MockResult() + break + + await asyncio.sleep(poll_interval) + elapsed += poll_interval + + if run_results is None: + # Timeout: try to kill process and capture final log + if pid and pid.isdigit(): + try: + await runtime.run_in_session(BashAction(command=f'kill -TERM {pid} 2>/dev/null || kill -9 {pid} 2>/dev/null || true', timeout=10.0)) + except Exception: + pass + try: + tail_log = await runtime.run_in_session( + BashAction(command='tail -n 200 /agent/runner.live.log', timeout=30.0) + ) + logger.info(f'{agent_label} live log tail (on timeout):\n{tail_log}') + except Exception as e: + logger.info(f'Failed to read {agent_label} live log after timeout: {e}') + raise TimeoutError(f'{agent_label} runner exceeded timeout {runner_timeout}s') + + else: + runner_cmd = f'/agent/runner.sh "{model}" "{task}"' + run_results = await runtime.run_in_session(BashAction(command=runner_cmd, timeout=runner_timeout)) + logger.info(f"agent's run results: {run_results}") + logger.info('Runner script finished.') + + # For long-running agents: upload eval scripts before running evaluation + if is_long_running_agent: + logger.info(f'Uploading _agent_eval directories for evaluation ({agent_label})...') + eval_dirs = [] + for root, dirs, files in os.walk(project_path): + if '_agent_eval' in dirs: + eval_source_path = os.path.join(root, '_agent_eval') + rel_path = os.path.relpath(eval_source_path, project_path) + eval_dirs.append((eval_source_path, rel_path)) + + if eval_dirs: + for eval_source_path, rel_path in eval_dirs: + target_eval_path = os.path.join('/repo', rel_path) + logger.info(f'Uploading 
_agent_eval from {eval_source_path} to {target_eval_path}') + try: + await runtime.upload( + UploadRequest( + source_path=eval_source_path, + target_path=target_eval_path, + ) + ) + logger.info(f'_agent_eval directory uploaded: {rel_path}') + except Exception as e: + logger.warning(f'Failed to upload _agent_eval from {eval_source_path}: {e}') + logger.info('All _agent_eval directories uploaded for evaluation.') + else: + logger.warning(f'No _agent_eval directories found in {project_path}') + + # Run evaluator: JSONL evaluator is a path to main.py (e.g. sosp23_acto/_agent_eval/main.py); + # must run from /repo with `python ` so the script is executed correctly. + if test_method.strip().endswith('.py'): + eval_cmd = f"cd /repo && python {test_method.strip()}" + else: + eval_cmd = f"cd /repo && {test_method}" + try: + test_output = await runtime.run_in_session(BashAction(command=eval_cmd)) + logger.info(test_output) + result = { + 'task': task, + 'project_path': project_path, + 'agent_run_results': run_results.output if hasattr(run_results, 'output') else str(run_results), + 'test_method': test_method, + 'score': _parse_eval_score(test_output), + 'status': 'success', + } + except Exception as e: + logger.info(f'Error running test method: {e}') + result = { + 'task': task, + 'project_path': project_path, + 'agent_run_results': run_results.output if hasattr(run_results, 'output') else str(run_results), + 'test_method': test_method, + 'score': 0, + 'status': f'error: {str(e)}', + } + + # For long-running agents: sync+stop (when keep_container=False) or keep container for inspection + if is_long_running_agent: + container_id = await _get_container_id_from_runtime(runtime, deployment) + container_name = ( + getattr(deployment, '_container_name', None) + or getattr(deployment, 'container_name', None) + or 'unknown' + ) + + if is_ae_agent and not keep_container and container_id and container_id != "unknown": + # Original artifact-agent behavior: sync workspace, commit image, 
stop container + try: + from agents.ae_agent.run_eval import save_container_after_run + saved_image, container_stopped = save_container_after_run(container_id, project_path, task_id) + result['saved_image'] = saved_image + result['container_stopped'] = container_stopped + result['container_id'] = container_id + result['container_kept'] = False + logger.info(f'ae_agent: synced workspace, saved image={saved_image}, stopped={container_stopped}') + except Exception as e: + logger.warning(f'save_container_after_run failed: {e}') + result['container_id'] = container_id + result['container_kept'] = True + try: + await deployment.stop() + except Exception as e: + logger.warning(f'deployment.stop() failed: {e}') + elif keep_container: + logger.info('=' * 80) + logger.info(f'Keeping Docker container running for {agent_label} (for debugging purposes).') + logger.info(f'Container ID: {container_id}') + logger.info(f'Task ID: {task_id}') + logger.info(f'Project Path: {project_path}') + logger.info(f' To inspect: docker exec -it {container_id} /bin/bash') + logger.info(f' To stop: docker stop {container_id}') + logger.info('=' * 80) + result['container_id'] = container_id + result['container_name'] = container_name + result['container_kept'] = True + else: + await deployment.stop() + result['container_id'] = container_id + result['container_kept'] = False + else: + await deployment.stop() + result['container_kept'] = False + + + return result + + +def run_eval( + deployment, + project_path, + task_id, + task, + model, + agent_path, + test_method, + save_path, + run_on_host=False, + timeout_ms=None, + gpu=False, + interactive=False, + enable_skill=False, + enable_subagent=False, + keep_container=True, +): + """Run evaluation either on host or in Docker container. 
+ + Args: + deployment: Docker image to use (ignored if run_on_host=True) + project_path: Path to the artifact project + task_id: Task identifier + task: Task description + model: Model name + agent_path: Path to agent scripts + test_method: Evaluation command + save_path: Path to save results + run_on_host: If True, run directly on host machine instead of Docker + timeout_ms: Per-task timeout in milliseconds (None = default 96h for long-running agents) + gpu: If True, pass --gpus all to Docker (Docker mode only) + interactive: If True, enable interactive mode after task (ae_agent only) + enable_skill: If True, enable Claude Agent SDK Skill (ae_agent only) + enable_subagent: If True, enable Claude Agent SDK Sub-agent (ae_agent only) + keep_container: If False and ae_agent, sync workspace + commit image + stop container after run + """ + + if run_on_host: + logger.info(f"Task {task_id} configured to run on HOST machine (run_on_host=True)") + return asyncio.run( + run_eval_on_host( + project_path, + task_id, + task, + model, + agent_path, + test_method, + save_path, + timeout_ms=timeout_ms, + interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + ) + + # Run in Docker container + image = deployment or 'bastoica/ae-agent-ubuntu24.04:latest' + + docker_args = [ + '--privileged', + '--cgroupns=host', + '-e', 'KIND_EXPERIMENTAL_CONTAINERD_SNAPSHOTTER=native', + ] + if gpu: + docker_args.extend(['--gpus', 'all']) + + config = DockerDeploymentConfig( + image=image, + startup_timeout=1200.0, + docker_args=docker_args, + ) + deployment_obj = config.get_deployment() + + return asyncio.run( + run_eval_in_env( + deployment_obj, + project_path, + task_id, + task, + model, + agent_path, + test_method, + save_path, + timeout_ms=timeout_ms, + gpu=gpu, + interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + keep_container=keep_container, + ) + ) + + + +def test(): + task = 'The java is not installed. 
Can you please setup it? Note: you are in a docker with root permission. DO NOT use sudo.' + project_path = '../data/benchmark/projects/test-repo' + test_method = 'java -version' + deployment = 'xuafeng/swe-go-python:latest' + model = 'claude-sonnet-4-5-20250929' + agent_path = './agents/claudecode' + save_path = './eval_results' + task_id = 'test_task_1' + result = run_eval(deployment, project_path, task_id, task, model, agent_path, test_method, save_path) + print('Test result:', result) + + +# TODO: still work on add openhand agent +def test1(): + task = 'The java is not installed. Can you please setup it? Note: you are in a docker with root permission. DO NOT use sudo.' + project_path = '../data/benchmark/projects/test-repo' + test_method = 'java -version' + deployment = 'xuafeng/swe-go-python:latest' + model = 'claude-sonnet-4-5-20250929' + agent_path = './agents/openhand' + save_path = './eval_results' + task_id = 'test_task_1' + result = run_eval(deployment, project_path, task_id, task, model, agent_path, test_method, save_path) + print('Test result:', result) + + +def test2(): + task = "create a python file named hello.py that prints 'hello world'" + project_path = '../data/benchmark/projects/test-repo' + test_method = 'python hello.py' + deployment = 'xuafeng/swe-go-python:latest' + model = 'claude-sonnet-4-5-20250929' + agent_path = './agents/claudecode' + save_path = './eval_results' + task_id = 'test_task_1' + eval_out = asyncio.run( + run_eval_in_env(deployment, project_path, task_id, task, model, agent_path, test_method, save_path) + ) + print(eval_out) + + +if __name__ == '__main__': + test1() diff --git a/benchmarks/arteval_bench/src/utils.py b/benchmarks/arteval_bench/src/utils.py new file mode 100644 index 00000000..56bc657f --- /dev/null +++ b/benchmarks/arteval_bench/src/utils.py @@ -0,0 +1,4 @@ +"""Re-export get_task for main.py when run from benchmark root (python src/main.py).""" +from core.utils import get_task + +__all__ = ["get_task"] diff 
--git a/sdk/utils.py b/sdk/utils.py index cbd79357..995fdfaf 100644 --- a/sdk/utils.py +++ b/sdk/utils.py @@ -62,22 +62,37 @@ def set_llm_endpoint_from_config(config_path): logger.warning(' - %s', key) logger.warning('Only [evaluator_api_keys] values will be used for both evaluator and model under test.') - # First, set environment variables from [llm] + # Placeholder values that should not override an existing env var (e.g. from export) + _placeholders = frozenset({'', 'xxx', 'sk-xxxx', 'sk-xxx'}) + + def _is_placeholder(val): + if val is None: + return True + s = str(val).strip().lower() + return not s or s in _placeholders or s.startswith('sk-xxx') + + # First, set environment variables from [llm] (do not overwrite existing non-placeholder env) logger.info('Setting the following environment variables from [llm]:') for key, value in llm_config.items(): + if _is_placeholder(value) and os.environ.get(key) and not _is_placeholder(os.environ.get(key)): + logger.info('%s: (keeping existing env)', key) + continue logger.info('%s', f'{key}: [REDACTED]' if 'key' in key.lower() else f'{key}: {value}') - os.environ[key] = value + os.environ[key] = str(value) # add exception for SWE-Agent: if key == 'AZURE_API_KEY': - os.environ['AZURE_OPENAI_API_KEY'] = value + os.environ['AZURE_OPENAI_API_KEY'] = str(value) logger.info('AZURE_OPENAI_API_KEY: [REDACTED]') # Then, set environment variables from [evaluator_api_keys] (will override [llm] if conflict) logger.info('Setting the following environment variables from [evaluator_api_keys]:') for key, value in evaluator_config.items(): + if _is_placeholder(value) and os.environ.get(key) and not _is_placeholder(os.environ.get(key)): + logger.info('%s: (keeping existing env)', key) + continue logger.info('%s', f'{key}: [REDACTED]' if 'key' in key.lower() else f'{key}: {value}') - os.environ[key] = value + os.environ[key] = str(value) # add exception for SWE-Agent: if key == 'AZURE_API_KEY': - os.environ['AZURE_OPENAI_API_KEY'] = 
value + os.environ['AZURE_OPENAI_API_KEY'] = str(value) logger.info('AZURE_OPENAI_API_KEY: [REDACTED]')