From 62b0642a8d50a8e9a21a5ca4dfdc8eef156909e2 Mon Sep 17 00:00:00 2001 From: Bogdan 'Bo' Stoica Date: Thu, 12 Feb 2026 10:09:18 +0000 Subject: [PATCH 1/7] Integrate ae-agent as long-running agent for ArtEvalBench --- .../data/benchmark/README_ae_agent.md | 38 + .../data/benchmark/run_ae_agent.sh | 16 + .../src/agents/ae_agent/README.md | 42 ++ .../src/agents/ae_agent/__init__.py | 4 + .../src/agents/ae_agent/install.sh | 22 + .../src/agents/ae_agent/interactive_runner.py | 105 +++ .../src/agents/ae_agent/runner.py | 248 +++++++ .../src/agents/ae_agent/runner.sh | 23 + .../src/agents/ae_agent/utils.py | 4 + benchmarks/arteval_bench/src/main.py | 125 ++++ .../arteval_bench/src/run_eval_in_env.py | 650 ++++++++++++++++++ 11 files changed, 1277 insertions(+) create mode 100644 benchmarks/arteval_bench/data/benchmark/README_ae_agent.md create mode 100755 benchmarks/arteval_bench/data/benchmark/run_ae_agent.sh create mode 100644 benchmarks/arteval_bench/src/agents/ae_agent/README.md create mode 100644 benchmarks/arteval_bench/src/agents/ae_agent/__init__.py create mode 100644 benchmarks/arteval_bench/src/agents/ae_agent/install.sh create mode 100644 benchmarks/arteval_bench/src/agents/ae_agent/interactive_runner.py create mode 100644 benchmarks/arteval_bench/src/agents/ae_agent/runner.py create mode 100644 benchmarks/arteval_bench/src/agents/ae_agent/runner.sh create mode 100644 benchmarks/arteval_bench/src/agents/ae_agent/utils.py create mode 100644 benchmarks/arteval_bench/src/main.py create mode 100644 benchmarks/arteval_bench/src/run_eval_in_env.py diff --git a/benchmarks/arteval_bench/data/benchmark/README_ae_agent.md b/benchmarks/arteval_bench/data/benchmark/README_ae_agent.md new file mode 100644 index 00000000..40d39473 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/README_ae_agent.md @@ -0,0 +1,38 @@ +# Run ArtEval Benchmark with AE Agent + +This directory contains `arteval_tasks.jsonl` and other benchmark task definitions. 
To run the benchmark with **ae_agent**, start from the **benchmark root** (`benchmarks/arteval_bench/`). + +## Run from benchmark root + +```bash +cd benchmarks/arteval_bench + +# Use ae_agent with data/benchmark/arteval_tasks.jsonl as input +python src/main.py \ + -i ./data/benchmark/arteval_tasks.jsonl \ + -a ae_agent \ + -m claude-sonnet-4-5-20250929 \ + -o ./outputs/ae_agent_$(date +%Y-%m-%d_%H-%M-%S) +``` + +Or, if `run.sh` supports passing an agent argument: + +```bash +cd benchmarks/arteval_bench +./run.sh claude-sonnet-4-5-20250929 ae_agent +``` + +## Environment + +- Set `ANTHROPIC_API_KEY` or `ANTHROPIC_FOUNDRY_API_KEY`. +- Optional: `ANTHROPIC_FOUNDRY_BASE_URL`, `CLAUDE_CODE_USE_FOUNDRY=1`. +- The ae_agent implementation lives under `src/agents/ae_agent/`, synced with the standalone ae-agent repo (runner, install, utils, interactive_runner). + +## Task format + +Each line of `arteval_tasks.jsonl` is one JSON object, including at least: + +- `artifact_id`, `artifact_dir`, `artifact_readme`, `artifact_url` +- `evaluator`: evaluation command (e.g. `cd /repo && python3 _agent_eval/main.py`) +- `docker_env`: Docker image +- `run_on_host`: when `true`, run on the host instead of Docker diff --git a/benchmarks/arteval_bench/data/benchmark/run_ae_agent.sh b/benchmarks/arteval_bench/data/benchmark/run_ae_agent.sh new file mode 100755 index 00000000..541f581a --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/run_ae_agent.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# Run ArtEval benchmark with ae_agent. Execute this script from the benchmark root. +# Usage: ./run_ae_agent.sh [optional: model name, default claude-sonnet-4-5-20250929] + +set -e +BENCH_ROOT="$(cd "$(dirname "$0")/../.." 
&& pwd)" +MODEL_NAME="${1:-claude-sonnet-4-5-20250929}" +cd "$BENCH_ROOT" +echo "==> ArtEval benchmark root: $BENCH_ROOT" +echo "==> Model: $MODEL_NAME" +echo "==> Agent: ae_agent" +python src/main.py \ + -i ./data/benchmark/arteval_tasks.jsonl \ + -a ae_agent \ + -m "$MODEL_NAME" \ + -o "./outputs/ae_agent_${MODEL_NAME//\//_}_$(date +%Y-%m-%d_%H-%M-%S)" diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/README.md b/benchmarks/arteval_bench/src/agents/ae_agent/README.md new file mode 100644 index 00000000..e1a9c4f2 --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/README.md @@ -0,0 +1,42 @@ +# AE Agent (ArtEval sub-agent) + +This agent is the **ae-agent** logic integrated as a sub-agent of the system-intelligence-benchmark ArtEval benchmark. It uses the Claude Agent SDK to run artifact evaluation tasks inside the benchmark container. Code is synced from the standalone [ae-agent](https://github.com/Couen/ae-agent) repo. + +## Files (synced from ae-agent) + +- **install.sh**: Installs `claude-agent-sdk==0.1.24` and configures `~/.claude/settings.json` (48h Bash timeout). +- **runner.sh**: Entry point invoked as `runner.sh `. Forwards to `runner.py`. Uses `/agent/current_task.txt` when the benchmark passes task via file. +- **runner.py**: Runs the task with Claude Agent SDK; supports rate-limit retry (429), message_formatter; second argument can be task text or path to file. +- **utils.py**: `DEFAULT_TIMEOUT_MS` for the runner. +- **interactive_runner.py**: Interactive multi-turn session inside container (e.g. `docker exec -it python3 /agent/interactive_runner.py `). +- **__init__.py**: Package marker. 
+ +## Usage from the benchmark + +From the benchmark root (`benchmarks/arteval_bench/`): + +```bash +python src/main.py -i ./data/benchmark/arteval_tasks.jsonl -a ae_agent -m claude-sonnet-4-5-20250929 -o ./outputs/ae_agent_run +``` + +Or use the helper script from `data/benchmark/`: + +```bash +./data/benchmark/run_ae_agent.sh [model_name] +``` + +The benchmark will: + +1. Upload the agent to `/agent` in the container. +2. For ae_agent: upload task to `/agent/current_task.txt`, then run `runner.sh "$model" /agent/current_task.txt` (avoids shell quoting with large tasks). +3. Use long-running and live-log behavior (48h timeout, live log streaming, `_agent_eval` removal before run and re-upload before evaluation, container kept for debugging). +4. Pass through `ANTHROPIC_API_KEY`, `ANTHROPIC_FOUNDRY_API_KEY`, `ANTHROPIC_FOUNDRY_BASE_URL`, `CLAUDE_CODE_USE_FOUNDRY` when set. + +## Dependencies + +- Python 3 with `claude-agent-sdk` (installed by `install.sh`). +- Optional: `message_formatter` for prettier output (if present in the environment). + +## Relation to standalone ae-agent repo + +The standalone ae-agent repo provides a full CLI (`main.py`, `run_eval.py`, `utils.py`) and host/Docker orchestration. This sub-agent is the in-container runner only; the benchmark’s `run_eval_in_env.py` handles orchestration, task file upload, and Foundry env vars. diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/__init__.py b/benchmarks/arteval_bench/src/agents/ae_agent/__init__.py new file mode 100644 index 00000000..e5bc1f34 --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/__init__.py @@ -0,0 +1,4 @@ +"""AE Agent for ArtEvalBench - Claude Agent SDK runner for artifact evaluation tasks. + +Contract: artifact at /repo, this agent at /agent; task passed as CLI arg or path to file (/agent/current_task.txt). 
+""" diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/install.sh b/benchmarks/arteval_bench/src/agents/ae_agent/install.sh new file mode 100644 index 00000000..8a498c3a --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/install.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Setup AE Agent environment inside benchmark container. +# Ensures claude-agent-sdk is available so runner.py can run. +set -e +if ! python3 -c "import claude_agent_sdk" 2>/dev/null; then + echo "Installing claude-agent-sdk..." + pip3 install claude-agent-sdk==0.1.24 || pip3 install --break-system-packages claude-agent-sdk==0.1.24 || true + if ! python3 -c "import claude_agent_sdk"; then + echo "WARNING: claude_agent_sdk still not importable; runner may fail." + fi +fi +# 48h Bash timeout for long-running artifact tasks +mkdir -p ~/.claude +cat > ~/.claude/settings.json << 'EOF' +{ + "env": { + "BASH_MAX_TIMEOUT_MS": "172800000", + "BASH_DEFAULT_TIMEOUT_MS": "172800000" + } +} +EOF +echo "AE Agent environment ready (~/.claude/settings.json configured)." diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/interactive_runner.py b/benchmarks/arteval_bench/src/agents/ae_agent/interactive_runner.py new file mode 100644 index 00000000..93e3e2cd --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/interactive_runner.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +"""Interactive runner for AE Agent - runs inside container after main task. + +Used when interactive=True: docker exec -it python3 /agent/interactive_runner.py +Artifact at /repo; API keys from container env. 
+""" + +import asyncio +import os +import sys + +sys.path.insert(0, '/agent') + +try: + from utils import DEFAULT_TIMEOUT_MS +except ImportError: + DEFAULT_TIMEOUT_MS = 172_800_000 + +try: + from claude_agent_sdk import ClaudeAgentOptions, ClaudeSDKClient +except ImportError as e: + print(f"ERROR: claude_agent_sdk not available: {e}", file=sys.stderr) + sys.exit(1) + + +def _build_system_prompt() -> str: + try: + timeout_ms_env = os.environ.get("BASH_MAX_TIMEOUT_MS") + timeout_ms = int(timeout_ms_env) if timeout_ms_env else DEFAULT_TIMEOUT_MS + except ValueError: + timeout_ms = DEFAULT_TIMEOUT_MS + + return """You are an experienced software engineer in an interactive session. + +ENVIRONMENT: +- You are inside a Docker container with root permissions. +- The artifact repository is at /repo. Change to it: cd /repo +- You have access to Read, Write, and Bash tools. + +TIMEOUT: Long-running commands can take hours; do not set short timeouts. + +You will receive follow-up instructions from the user. Complete each one and respond. +If the user asks to stop or says 'quit'/'exit', acknowledge and they will end the session.""" + + +def _display_message(msg) -> None: + if hasattr(msg, 'content'): + for block in msg.content: + if hasattr(block, 'text'): + print(block.text, end='', flush=True) + print(flush=True) + + +async def _interactive_loop(model_name: str) -> int: + options = ClaudeAgentOptions( + system_prompt=_build_system_prompt(), + allowed_tools=["Read", "Write", "Bash"], + setting_sources=["user"], + ) + + print("\n" + "=" * 60, flush=True) + print("Interactive mode - Agent ready. Type your instructions (or 'quit'/'exit' to end).", flush=True) + print("=" * 60 + "\n", flush=True) + + async with ClaudeSDKClient(options=options) as client: + await client.query( + "Please confirm you are in /repo and ready for the user's follow-up instructions. Reply briefly that you are ready." 
+ ) + async for msg in client.receive_response(): + _display_message(msg) + + while True: + try: + user_input = input("\n>>> ").strip() + except (EOFError, KeyboardInterrupt): + print("\nExiting interactive mode.", flush=True) + return 0 + + if not user_input: + continue + if user_input.lower() in ('quit', 'exit', 'q'): + print("Exiting interactive mode.", flush=True) + return 0 + + await client.query(user_input) + async for msg in client.receive_response(): + _display_message(msg) + + return 0 + + +def main() -> int: + model_name = os.environ.get("AE_AGENT_MODEL", "claude-sonnet-4-5-20250929") + if len(sys.argv) >= 2: + model_name = sys.argv[1] + + if not os.environ.get('ANTHROPIC_API_KEY') and not os.environ.get('ANTHROPIC_FOUNDRY_API_KEY'): + print("ERROR: ANTHROPIC_API_KEY or ANTHROPIC_FOUNDRY_API_KEY must be set.", file=sys.stderr) + return 1 + + return asyncio.run(_interactive_loop(model_name)) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/runner.py b/benchmarks/arteval_bench/src/agents/ae_agent/runner.py new file mode 100644 index 00000000..44fcea3b --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/runner.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 +"""AE Agent runner for ArtEvalBench - Claude Agent SDK for artifact tasks. + +Runs inside benchmark container: artifact at /repo, agent at /agent; task as CLI arg or path to file. 
+""" + +import asyncio +import os +import sys + +sys.path.insert(0, '/agent') + +try: + from utils import DEFAULT_TIMEOUT_MS +except ImportError: + DEFAULT_TIMEOUT_MS = 172_800_000 # 48h fallback + +try: + from claude_agent_sdk import query, ClaudeAgentOptions + CLAUDE_SDK_AVAILABLE = True +except ImportError as e: + print(f"ERROR: Failed to import claude_agent_sdk: {e}", file=sys.stderr) + CLAUDE_SDK_AVAILABLE = False + +try: + from message_formatter import MessageFormatter + FORMATTER_AVAILABLE = True +except ImportError: + print("WARNING: message_formatter not available, will use basic output.", file=sys.stderr) + FORMATTER_AVAILABLE = False + +if not CLAUDE_SDK_AVAILABLE: + print("ERROR: claude_agent_sdk is not available.", file=sys.stderr) + sys.exit(1) + +RATE_LIMIT_MAX_RETRIES = 5 +RATE_LIMIT_WAIT_SEC = 60 +RATE_LIMIT_WAIT_MAX_SEC = 600 +RATE_LIMIT_WRAPPED_MAX_RETRIES = 3 + + +def _is_rate_limit_error(exc: BaseException) -> bool: + msg = str(exc).lower() + return "429" in msg or "rate limit" in msg or "ratelimitreached" in msg + + +def _is_possible_wrapped_rate_limit(exc: BaseException) -> bool: + msg = str(exc) + return ("command failed" in msg.lower() and "exit code 1" in msg.lower()) or "check stderr" in msg.lower() + + +def _parse_retry_after_seconds(exc: BaseException) -> int | None: + import re + m = re.search(r"wait\s+(\d+)\s*seconds", str(exc), re.I) + return int(m.group(1)) if m else None + + +def get_default_work_dir(): + return os.environ.get('WORK_DIR', None) + + +async def run_agent(model_name: str, task_description: str): + work_dir_hint = get_default_work_dir() + work_dir_instruction = f"- You may start by checking: {work_dir_hint}\n" if work_dir_hint else "" + + try: + timeout_ms_env = os.environ.get("BASH_MAX_TIMEOUT_MS") + timeout_ms = int(timeout_ms_env) if timeout_ms_env is not None else DEFAULT_TIMEOUT_MS + except ValueError: + timeout_ms = DEFAULT_TIMEOUT_MS + + base_prompt = f"""You are an experienced software engineer. 
+ +ENVIRONMENT SETUP: +- You are running inside a Docker container with root permissions. +- The artifact repository should be in the current working directory or nearby. +- You should explore the directory structure to find the artifact repository. +{work_dir_instruction}- You have access to Read, Write, and Bash tools to complete the task. + +YOUR TASK: +{task_description} + +TIMEOUT CONFIGURATION (CRITICAL): +- The system has been configured with a default Bash timeout of {timeout_ms} ms (via BASH_MAX_TIMEOUT_MS). +- DO NOT specify timeout parameters in your Bash commands - the system default will be used automatically. +- Long-running commands (builds, tests, benchmarks) can take hours - this is normal and expected. +- If a command seems to be running long, DO NOT cancel or re-run it. Wait for completion. + +IMPORTANT GUIDELINES: +1. First, explore the current directory structure to understand where you are and where the artifact is located. +2. Navigate to the artifact repository root directory. +3. If you see 'sudo' in any instructions, remove it (you already have root access). +4. Do NOT attempt to switch git branches (you are already on the correct branch). +5. Follow the README instructions step by step. +6. You MUST execute every verification step, test, or command that the README (or referenced docs like TESTBED.md) says is required for evaluation or reproduction. Do NOT skip any such step just because the README mentions that it may take a long time. Long runtimes are expected; run each verification and wait for completion. +7. Use the Bash tool to run commands, Read tool to inspect files, and Write tool to create/modify files. +8. Work systematically through environment setup, build/install, benchmark preparation, and experiment execution. +9. If you encounter errors, try to debug and resolve them using the available tools. +10. For long-running commands, let them complete naturally. 
Do NOT set short timeouts or interrupt them.""" + + options = ClaudeAgentOptions( + system_prompt=base_prompt, + allowed_tools=["Read", "Write", "Bash"], + setting_sources=["user"], + ) + + formatter = None + if FORMATTER_AVAILABLE: + try: + formatter = MessageFormatter() + formatter.print_header() + except Exception as e: + print(f"WARNING: Failed to initialize MessageFormatter: {e}", file=sys.stderr) + + print(f"\n{'='*60}", flush=True) + print(f"Starting AE Agent (Claude SDK) with model: {model_name}", flush=True) + print(f"Task: {task_description[:200]}..." if len(task_description) > 200 else f"Task: {task_description}", flush=True) + print(f"{'='*60}\n", flush=True) + + last_exception = None + for attempt in range(1, RATE_LIMIT_MAX_RETRIES + 1): + try: + result_text = "" + message_count = 0 + + async for message in query( + prompt="Please start working on the artifact task described in the system prompt. Begin by changing to the artifact repository directory and examining the README or instructions.", + options=options + ): + message_count += 1 + if message_count % 10 == 0: + print(f"[Progress] Processed {message_count} messages...", flush=True) + + if formatter: + try: + formatter.format_message(message) + except Exception as e: + print(f"WARNING: Failed to format message: {e}", file=sys.stderr, flush=True) + print(str(message), flush=True) + else: + print(str(message), flush=True) + + msg_str = str(message) + if 'ResultMessage' in msg_str or 'TextBlock' in msg_str: + result_text = msg_str + + if formatter: + formatter.print_footer() + + print(f"\n{'='*60}", flush=True) + print(f"AE Agent execution completed. 
Total messages: {message_count}", flush=True) + print(f"{'='*60}\n", flush=True) + + if formatter: + try: + metadata = formatter.get_api_metadata() + if metadata: + print(f"\nAPI Usage Metadata:", flush=True) + print(f" Input tokens: {metadata.get('input_tokens', 'N/A')}", flush=True) + print(f" Output tokens: {metadata.get('output_tokens', 'N/A')}", flush=True) + print(f" Total cost: ${metadata.get('total_cost', 'N/A')}", flush=True) + except Exception as e: + print(f"WARNING: Failed to get metadata: {e}", file=sys.stderr, flush=True) + + return 0 + + except asyncio.TimeoutError as e: + print(f"\nERROR: AE Agent execution timed out: {e}", file=sys.stderr, flush=True) + if formatter: + formatter.print_footer() + return 1 + except Exception as e: + last_exception = e + explicit_429 = _is_rate_limit_error(e) + wrapped_possible_429 = _is_possible_wrapped_rate_limit(e) and not explicit_429 + max_retries = RATE_LIMIT_MAX_RETRIES if explicit_429 else RATE_LIMIT_WRAPPED_MAX_RETRIES + is_retriable = (explicit_429 or wrapped_possible_429) and attempt < max_retries + if is_retriable: + parsed = _parse_retry_after_seconds(e) + wait_sec = min(parsed, RATE_LIMIT_WAIT_MAX_SEC) if parsed is not None else min( + RATE_LIMIT_WAIT_SEC * (2 ** (attempt - 1)), RATE_LIMIT_WAIT_MAX_SEC + ) + print( + f"\nRate limit or API error. 
Waiting {wait_sec}s before retry (attempt {attempt}/{max_retries})...", + file=sys.stderr, flush=True, + ) + await asyncio.sleep(wait_sec) + continue + print(f"\nERROR: AE Agent execution failed: {e}", file=sys.stderr, flush=True) + import traceback + traceback.print_exc(file=sys.stderr) + sys.stderr.flush() + if formatter: + formatter.print_footer() + return 1 + + if last_exception: + print(f"\nERROR: AE Agent failed after {RATE_LIMIT_MAX_RETRIES} attempts: {last_exception}", file=sys.stderr, flush=True) + return 1 + + +def main(): + if len(sys.argv) != 3: + print("Usage: python3 runner.py ", file=sys.stderr) + print("Example: python3 runner.py claude-sonnet-4-5-20250929 /agent/current_task.txt", file=sys.stderr) + sys.exit(1) + + model_name = sys.argv[1] + task_arg = sys.argv[2] + if os.path.isfile(task_arg): + with open(task_arg, 'r', encoding='utf-8') as f: + task_description = f.read() + else: + task_description = task_arg + + if not os.environ.get('ANTHROPIC_API_KEY') and not os.environ.get('ANTHROPIC_FOUNDRY_API_KEY'): + print("ERROR: ANTHROPIC_API_KEY or ANTHROPIC_FOUNDRY_API_KEY must be set.", file=sys.stderr) + sys.exit(1) + + try: + timeout_ms_env = os.environ.get("BASH_MAX_TIMEOUT_MS") + timeout_ms = int(timeout_ms_env) if timeout_ms_env is not None else DEFAULT_TIMEOUT_MS + except ValueError: + timeout_ms = DEFAULT_TIMEOUT_MS + timeout_s = timeout_ms / 1000.0 + + try: + exit_code = asyncio.run( + asyncio.wait_for( + run_agent(model_name, task_description), + timeout=timeout_s, + ) + ) + except asyncio.TimeoutError: + print(f"ERROR: Agent execution exceeded timeout ({timeout_s} seconds).", file=sys.stderr, flush=True) + sys.exit(1) + except Exception as e: + print(f"ERROR: Failed to run agent: {e}", file=sys.stderr, flush=True) + import traceback + traceback.print_exc(file=sys.stderr) + sys.stderr.flush() + sys.exit(1) + + sys.exit(exit_code) + + +if __name__ == '__main__': + main() diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/runner.sh 
b/benchmarks/arteval_bench/src/agents/ae_agent/runner.sh new file mode 100644 index 00000000..090f8941 --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/runner.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# AE Agent runner for ArtEvalBench. Invoked as: runner.sh +# Do not use set -e; some commands may return non-zero without indicating failure. + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo "Example: $0 claude-sonnet-4-5-20250929 /agent/current_task.txt" + exit 1 +fi + +export ANTHROPIC_API_KEY="${ANTHROPIC_API_KEY}" +export PYTHONUNBUFFERED=1 + +# 48h = 172800000 ms (align with benchmark long-running agent timeout) +if [ -z "$BASH_MAX_TIMEOUT_MS" ]; then + export BASH_MAX_TIMEOUT_MS=172800000 +fi +if [ -z "$BASH_DEFAULT_TIMEOUT_MS" ]; then + export BASH_DEFAULT_TIMEOUT_MS="$BASH_MAX_TIMEOUT_MS" +fi + +# Invoke Python runner (-u for unbuffered output). Second arg can be task text or path to file. +python3 -u /agent/runner.py "$1" "$2" diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/utils.py b/benchmarks/arteval_bench/src/agents/ae_agent/utils.py new file mode 100644 index 00000000..fda8c0a6 --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/utils.py @@ -0,0 +1,4 @@ +"""Helper for AE Agent runner (timeout constant used by runner.py).""" + +# Default total timeout in milliseconds (48h); used by runner when BASH_MAX_TIMEOUT_MS is unset. 
+DEFAULT_TIMEOUT_MS = 172_800_000 diff --git a/benchmarks/arteval_bench/src/main.py b/benchmarks/arteval_bench/src/main.py new file mode 100644 index 00000000..df3e6c49 --- /dev/null +++ b/benchmarks/arteval_bench/src/main.py @@ -0,0 +1,125 @@ +"""This script runs a benchmark for evaluating patches in a software project.""" + +import argparse +import json +import os +import sys +from datetime import datetime + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../'))) + +from sdk.logger import logger +from sdk.utils import set_llm_endpoint_from_config + +set_llm_endpoint_from_config('env.toml') + +from run_eval_in_env import run_eval +from utils import get_task + +def main(file_path, model, agent, save_path): + """Main function for running the benchmark.""" + logger.info(f'Using model: {model}, agent: {agent}') + with open(file_path) as f: + for line in f: + if not line.strip(): + continue # Skip empty lines + + try: + item = json.loads(line) + except json.JSONDecodeError: + logger.info(f'Skipping invalid JSON line: {line}') + continue + + deployment = item.get('docker_env', None) + project_path = f"./data/benchmark/{item.get('artifact_dir', None)}" + task_file = item.get('artifact_readme', None) + task_id = item.get('artifact_id', None) + test_method = item.get('evaluator', None) + run_on_host = item.get('run_on_host', False) + + task = get_task(task_file) + + logger.info(f"Task {task_id}: run_on_host={run_on_host}") + + result = run_eval( + deployment=deployment, + project_path=project_path, + task_id=task_id, + task=task, + model=model, + agent_path=agent, + test_method=test_method, + save_path=save_path, + run_on_host=run_on_host, # Pass the flag + ) + + result['expected_score'] = item.get('expected_score', -1) + with open(f'{save_path}/result.jsonl', 'a+', encoding='utf-8') as fw: + fw.write(json.dumps(result) + '\n') + + success_count = 0 + total_count = 0 + with open(f'{save_path}/result.jsonl', encoding='utf-8') as f: + for line 
in f: + result = json.loads(line.strip()) + if result.get('status') == 'success': + success_count += (result.get('score') == result.get('expected_score', -1)) + total_count += 1 + logger.info(f'Test run completed: {success_count}/{total_count} tasks succeeded.') + summary_data = {'final_score': success_count / total_count, 'total_tasks': total_count} + + with open(os.path.join(save_path, 'avg_score.json'), 'w', encoding='utf-8') as summary_file: + json.dump(summary_data, summary_file, indent=4) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='example benchmark') + parser.add_argument( + '-i', + '--input_file', + help='Benchmark input file', + default='./data/benchmark/arteval_tasks.jsonl', + #default='./data/benchmark/env_setup_examples.jsonl', + ) + parser.add_argument('-o', '--save_path', help='Result save path', default=None) + parser.add_argument( + '-a', + '--agent', + help='Agent Name', + default='claudecode', + ) + parser.add_argument( + '-m', + '--model_name', + help='Model Name', + default='claude-sonnet-4-5-20250929', + ) + # Note that if your benchmark has multiple tasks, you need to add --task + # in your code to enable task selection. 
+ parser.add_argument('-t', '--task', help='specify task in scenarios', default=None) + + args = parser.parse_args() + + model_name = args.model_name + agent = args.agent + input_file = args.input_file + save_path = args.save_path + task = args.task + + logger.debug(f"Benchmark path: {input_file}") + + if save_path is None: + str_model_name = model_name.replace('/', '_') + timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + save_path = os.path.join('./outputs', f'env_setup_project__{str_model_name}__{args.agent}__{timestamp}') + + if agent == 'claudecode': + agent = './src/agents/claudecode' + elif agent == 'claude_sdk': + agent = './src/agents/claude_sdk' + elif agent == 'ae_agent' or agent == 'ae-agent': + agent = './src/agents/ae_agent' + save_path = os.path.abspath(os.path.expanduser(save_path)) + os.makedirs(save_path, exist_ok=True) + + main(input_file, model_name, agent, save_path) diff --git a/benchmarks/arteval_bench/src/run_eval_in_env.py b/benchmarks/arteval_bench/src/run_eval_in_env.py new file mode 100644 index 00000000..190fadb0 --- /dev/null +++ b/benchmarks/arteval_bench/src/run_eval_in_env.py @@ -0,0 +1,650 @@ +"""Patch evaluator for running tests in a deployment.""" + +import asyncio +import json +import os +import re +import subprocess +import sys +import tempfile +import shutil +from pathlib import Path + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../'))) + +from swerex.deployment.docker import DockerDeploymentConfig +from swerex.runtime.abstract import BashAction, Command, CreateBashSessionRequest, UploadRequest + +from sdk.logger import logger + + +def _parse_eval_score(output) -> int: + """Parse evaluation score from BashObservation or string output. + + - If a line is a single digit (e.g. '4', '0'), use it (prefer last such line). + - If output contains 'Agent scores: {...}' (Oracle-style evaluator), count ': 1' as passed items. + - Otherwise return 0. 
+ """ + s = (getattr(output, "output", None) or str(output) or "").strip() + if not s: + return 0 + lines = s.splitlines() + for line in reversed(lines): + t = line.strip() + if t.isdigit(): + return int(t) + m = re.search(r"Agent scores:\s*\{[^}]*\}", s) + if m: + return m.group(0).count(": 1") + return 0 + + +def write_to_file(file_path, content): + """Write content to a file.""" + with open(file_path, 'w') as f: + f.write(content) + + +def setup_claude_settings_on_host(): + """Set up ~/.claude/settings.json with timeout configuration on host.""" + claude_dir = Path.home() / ".claude" + settings_file = claude_dir / "settings.json" + + claude_dir.mkdir(exist_ok=True) + + settings = { + "env": { + "BASH_MAX_TIMEOUT_MS": "172800000", # 48 hours + "BASH_DEFAULT_TIMEOUT_MS": "172800000" + } + } + + with open(settings_file, 'w') as f: + json.dump(settings, f, indent=2) + + logger.info(f"Created {settings_file} with 48-hour timeout configuration.") + + +async def run_eval_on_host(project_path, task_id, task, model, agent_path, test_method, save_path): + """Run evaluation directly on host machine (no Docker container). + + This is useful for tasks that require Kind clusters or other Docker-in-Docker + scenarios that don't work well in nested containers. 
+ """ + logger.info("=" * 80) + logger.info("Running evaluation directly on HOST MACHINE (not in Docker)") + logger.info("=" * 80) + + # Check prerequisites + import shutil + + if not shutil.which("docker"): + raise RuntimeError("Docker is not installed on host") + + # Check if Docker is running + result = subprocess.run(["docker", "ps"], capture_output=True, timeout=10) + if result.returncode != 0: + raise RuntimeError("Docker is not running on host") + + # Check API key + if not os.environ.get("ANTHROPIC_API_KEY"): + raise RuntimeError("ANTHROPIC_API_KEY environment variable is not set") + + # Setup Claude settings + setup_claude_settings_on_host() + + # Ensure project path is absolute + project_path = os.path.abspath(project_path) + if not os.path.isdir(project_path): + raise RuntimeError(f"Project path does not exist: {project_path}") + + logger.info(f"Project path: {project_path}") + logger.info(f"Task ID: {task_id}") + logger.info(f"Model: {model}") + + # Import Claude Agent SDK + try: + from claude_agent_sdk import query, ClaudeAgentOptions + except ImportError as e: + raise RuntimeError(f"claude_agent_sdk not installed: {e}. Install with: pip install claude-agent-sdk") + + # Build system prompt for host execution + system_prompt = f"""You are an experienced software engineer completing an artifact evaluation task. 
+ +ENVIRONMENT SETUP (HOST MACHINE - NOT DOCKER): +- You are running DIRECTLY on the host machine (NOT inside a Docker container) +- Docker daemon is already running on this host +- When you use Kind to create Kubernetes clusters, they will be created using the host's Docker +- This avoids Docker-in-Docker compatibility issues +- You may need sudo for some operations + +ARTIFACT LOCATION: +- The artifact repository is located at: {project_path} +- Start by changing to this directory: cd {project_path} + +YOUR TASK: +{task} + +TIMEOUT CONFIGURATION (CRITICAL): +- Long-running commands (builds, tests, Kind cluster creation) are expected +- DO NOT set short timeouts - let commands complete naturally +- Kind cluster creation can take 5-10 minutes +- Full benchmark runs can take hours + +IMPORTANT GUIDELINES: +1. First, cd to {project_path} and examine the directory structure +2. Follow the README instructions step by step +3. If you see 'sudo' in instructions, you can use it (or skip if already root) +4. Use the Bash tool to run commands, Read tool to inspect files +5. Work systematically through setup, build, and experiment execution +6. If you encounter errors, debug and resolve them using available tools +7. For Kind clusters, they will work properly since you're on the host (not DinD)""" + + options = ClaudeAgentOptions( + system_prompt=system_prompt, + allowed_tools=["Read", "Write", "Bash"], + setting_sources=["user"], # Load ~/.claude/settings.json for timeout config + ) + + # Set environment variables + os.environ['BASH_MAX_TIMEOUT_MS'] = '172800000' + os.environ['BASH_DEFAULT_TIMEOUT_MS'] = '172800000' + + logger.info("Starting Claude Agent SDK (Host Mode)...") + + message_count = 0 + run_results_output = "" + + try: + async for message in query( + prompt=f"Please start the artifact evaluation task. 
Begin by changing to the artifact directory at {project_path} and examining its contents.", + options=options + ): + message_count += 1 + + if message_count % 10 == 0: + logger.info(f"[Progress] Processed {message_count} messages...") + + # Log each message + msg_str = str(message) + logger.info(msg_str) + + if 'ResultMessage' in msg_str or 'TextBlock' in msg_str: + run_results_output = msg_str + + logger.info(f"Claude Agent SDK execution completed. Total messages: {message_count}") + + except Exception as e: + logger.error(f"Claude Agent SDK execution failed: {e}") + import traceback + traceback.print_exc() + run_results_output = f"Error: {e}" + + # Run evaluation (test_method) + logger.info("Running evaluation script...") + try: + # Change to project directory and run test + eval_cmd = f"cd {project_path} && {test_method}" + eval_result = subprocess.run( + eval_cmd, + shell=True, + capture_output=True, + text=True, + timeout=300 # 5 minute timeout for evaluation + ) + test_output = eval_result.stdout.strip() + logger.info(f"Evaluation output: {test_output}") + + result = { + 'task': task, + 'project_path': project_path, + 'agent_run_results': run_results_output, + 'test_method': test_method, + 'score': int(test_output) if test_output.isdigit() else 0, + 'status': 'success', + 'run_on_host': True, + } + except Exception as e: + logger.error(f"Error running test method: {e}") + result = { + 'task': task, + 'project_path': project_path, + 'agent_run_results': run_results_output, + 'test_method': test_method, + 'score': 0, + 'status': f'error: {str(e)}', + 'run_on_host': True, + } + + return result + + +async def run_eval_in_env(deployment, project_path, task_id, task, model, agent_path, test_method, save_path): + """Spoiler: This function will work with any deployment.""" + await deployment.start() + runtime = deployment.runtime + + if hasattr(runtime, "_config"): + logger.info(f"Current RemoteRuntime timeout: {runtime._config.timeout}s") + # 48 hours = 172800s 
(aligned with Bash command timeout) + runtime._config.timeout = 172800.0 + logger.info(f"Overriding RemoteRuntime timeout to {runtime._config.timeout}s (48 hours)") + + # Issue a few one-off commands, similar to `subprocess.run()` + logger.info(await runtime.execute(Command(command=['echo', 'Hello, world!']))) + + # Create a bash session + await runtime.create_session(CreateBashSessionRequest()) + # Run a command in the session + # The difference to the one-off commands is that environment state persists! + logger.info(await runtime.run_in_session(BashAction(command="export MYVAR='test'"))) + logger.info(await runtime.run_in_session(BashAction(command='echo $MYVAR'))) + + logger.info('Uploading project files...') + logger.info( + await runtime.upload( + UploadRequest( + source_path=project_path, + target_path='/repo', + ) + ) + ) + logger.info('Project files uploaded.') + + # Long-running agents (claude_sdk, ae_agent): remove eval script dirs so the agent cannot see evaluation logic + is_claude_sdk = str(agent_path).endswith('claude_sdk') + is_ae_agent = str(agent_path).endswith('ae_agent') + is_long_running_agent = is_claude_sdk or is_ae_agent + agent_label = 'ae_agent' if is_ae_agent else 'claude_sdk' + if is_long_running_agent: + logger.info(f'Removing _agent_eval directories for {agent_label} to prevent answer leakage...') + await runtime.run_in_session( + BashAction(command='find /repo -type d -name "_agent_eval" -exec rm -rf {} + 2>/dev/null || true', timeout=30.0) + ) + logger.info('_agent_eval directories removed.') + + run_results = await runtime.run_in_session(BashAction(command='cd /repo')) + logger.info(run_results) + run_results = await runtime.run_in_session(BashAction(command='pwd')) + logger.info(f'Current directory: {run_results}') + run_results = await runtime.run_in_session(BashAction(command='ls')) + logger.info(f'Current directory contents: {run_results}') + + logger.info('Uploading agent runner script...') + logger.info( + await 
runtime.upload( + UploadRequest( + source_path=agent_path, + target_path='/agent', + ) + ) + ) + logger.info(await runtime.run_in_session(BashAction(command='ls /agent/runner.sh'))) + logger.info('Agent runner script uploaded.') + + logger.info('Setup the agent running environment...') + logger.info(await runtime.run_in_session(BashAction(command='chmod +x /agent/runner.sh /agent/install.sh'))) + logger.info(await runtime.run_in_session(BashAction(command='cat /agent/runner.sh'))) + logger.info(await runtime.run_in_session(BashAction(command='/agent/install.sh'))) + + # Set required env vars for long-running agents (passed from host into container) + if is_long_running_agent: + parts = [] + anthropic_api_key = os.environ.get('ANTHROPIC_API_KEY') + foundry_api_key = os.environ.get('ANTHROPIC_FOUNDRY_API_KEY') + if anthropic_api_key: + escaped_key = anthropic_api_key.replace("'", "'\"'\"'") + parts.append(f"export ANTHROPIC_API_KEY='{escaped_key}'") + if foundry_api_key: + escaped_foundry = foundry_api_key.replace("'", "'\"'\"'") + parts.append(f"export ANTHROPIC_FOUNDRY_API_KEY='{escaped_foundry}'") + if not anthropic_api_key: + parts.append(f"export ANTHROPIC_API_KEY='{escaped_foundry}'") + foundry_base = os.environ.get('ANTHROPIC_FOUNDRY_BASE_URL') + if foundry_base: + escaped_url = foundry_base.replace("'", "'\"'\"'") + parts.append(f"export ANTHROPIC_FOUNDRY_BASE_URL='{escaped_url}'") + if os.environ.get('CLAUDE_CODE_USE_FOUNDRY') == '1': + parts.append("export CLAUDE_CODE_USE_FOUNDRY=1") + if parts: + set_env_cmd = " && ".join(parts) + logger.info('Setting Anthropic/Foundry API key and env in container...') + logger.info(await runtime.run_in_session(BashAction(command=set_env_cmd))) + if not anthropic_api_key and not foundry_api_key: + logger.warning('Neither ANTHROPIC_API_KEY nor ANTHROPIC_FOUNDRY_API_KEY found. 
Runner may fail.') + + # For ae_agent: upload task to /agent/current_task.txt to avoid shell quoting with large tasks + if is_ae_agent: + tmpdir = tempfile.mkdtemp(prefix='ae_agent_task_') + try: + task_file_host = os.path.join(tmpdir, 'current_task.txt') + with open(task_file_host, 'w', encoding='utf-8') as f: + f.write(task) + await runtime.upload(UploadRequest(source_path=tmpdir, target_path='/agent_task_file')) + await runtime.run_in_session(BashAction(command='cp /agent_task_file/current_task.txt /agent/current_task.txt', timeout=10.0)) + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + logger.info('Task file uploaded to /agent/current_task.txt for ae_agent.') + + logger.info('Running runner script...') + runner_timeout = 172800.0 if is_long_running_agent else 1200.0 # 48h for claude_sdk/ae_agent + + if is_long_running_agent: + # Live log monitoring: run runner in background, poll log file periodically + await runtime.run_in_session(BashAction(command='rm -f /agent/runner.live.log && touch /agent/runner.live.log', timeout=10.0)) + + # ae_agent: use task file to avoid shell quoting; others pass task string + if is_ae_agent: + start_cmd = ( + 'stdbuf -oL -eL /agent/runner.sh "' + model + '" /agent/current_task.txt > /agent/runner.live.log 2>&1 & ' + 'RUNNER_PID=$!; ' + 'sleep 1; ' + 'echo RUNNER_PID=$RUNNER_PID' + ) + else: + start_cmd = ( + f'bash -c "stdbuf -oL -eL /agent/runner.sh \\"{model}\\" \\"{task}\\" > /agent/runner.live.log 2>&1 & ' + 'RUNNER_PID=$!; ' + 'sleep 1; ' + 'echo RUNNER_PID=$RUNNER_PID"' + ) + start_res = await runtime.run_in_session(BashAction(command=start_cmd, timeout=30.0)) + start_output = str(getattr(start_res, "output", "")).strip() + + pid = None + for line in start_output.split('\n'): + if 'RUNNER_PID=' in line: + pid = line.split('RUNNER_PID=', 1)[1].strip() + break + + if not pid or not pid.isdigit(): + # Fallback: find PID by process name after short delay + await asyncio.sleep(2) + ps_res = await runtime.run_in_session( + 
BashAction(command="ps aux | grep '[r]unner.py' | awk '{print $2}' | head -1", timeout=10.0) + ) + pid = str(getattr(ps_res, "output", "")).strip() + + logger.info(f'{agent_label} runner started with pid: {pid}') + + await asyncio.sleep(2) # Allow log file to have content + + elapsed = 0.0 + poll_interval = 10.0 # Poll every 10s for live log + run_results = None + last_log_content = "" # Track last read content to avoid duplicate output + + while elapsed < runner_timeout: + try: + log_res = await runtime.run_in_session( + BashAction(command='cat /agent/runner.live.log 2>/dev/null || echo ""', timeout=30.0) + ) + current_log_content = str(getattr(log_res, "output", "")).strip() + + if current_log_content and current_log_content != last_log_content: + if last_log_content and current_log_content.startswith(last_log_content): + new_content = current_log_content[len(last_log_content):].strip() + if new_content: + logger.info(f'[{agent_label} live log @ {elapsed:.0f}s ({elapsed/60:.1f} min)]\n{new_content}') + else: + logger.info(f'[{agent_label} live log @ {elapsed:.0f}s ({elapsed/60:.1f} min)]\n{current_log_content}') + last_log_content = current_log_content + elif elapsed % 300 == 0 and elapsed > 0: + logger.info(f'[{agent_label} still running @ {elapsed:.0f}s ({elapsed/60:.1f} min), no new output]') + except Exception as e: + logger.info(f'Failed to read {agent_label} live log: {e}') + + if pid and pid.isdigit(): + ps_res = await runtime.run_in_session( + BashAction(command=f'ps -p {pid} >/dev/null 2>&1; echo $?', timeout=10.0) + ) + ps_code = str(getattr(ps_res, "output", "")).strip() + if ps_code != "0": + wait_res = await runtime.run_in_session( + BashAction(command=f'wait {pid} 2>/dev/null; echo $?', timeout=30.0) + ) + exit_code_str = str(getattr(wait_res, "output", "")).strip() + + class MockResult: + def __init__(self, code): + self.exit_code = int(code) if code.isdigit() else 0 + self.output = f'exit_code={self.exit_code}' + run_results = 
MockResult(exit_code_str) + logger.info(f'{agent_label} runner finished with exit code: {run_results.exit_code}') + break + else: + ps_res = await runtime.run_in_session( + BashAction(command="ps aux | grep '[r]unner.py' | wc -l", timeout=10.0) + ) + proc_count = str(getattr(ps_res, "output", "")).strip() + if proc_count == "0" or not proc_count.isdigit() or int(proc_count) == 0: + logger.info(f'{agent_label} runner process not found, assuming finished') + class MockResult: + def __init__(self): + self.exit_code = 0 + self.output = 'exit_code=0' + run_results = MockResult() + break + + await asyncio.sleep(poll_interval) + elapsed += poll_interval + + if run_results is None: + # Timeout: try to kill process and capture final log + if pid and pid.isdigit(): + try: + await runtime.run_in_session(BashAction(command=f'kill -TERM {pid} 2>/dev/null || kill -9 {pid} 2>/dev/null || true', timeout=10.0)) + except Exception: + pass + try: + tail_log = await runtime.run_in_session( + BashAction(command='tail -n 200 /agent/runner.live.log', timeout=30.0) + ) + logger.info(f'{agent_label} live log tail (on timeout):\n{tail_log}') + except Exception as e: + logger.info(f'Failed to read {agent_label} live log after timeout: {e}') + raise TimeoutError(f'{agent_label} runner exceeded timeout {runner_timeout}s') + + else: + runner_cmd = f'/agent/runner.sh "{model}" "{task}"' + run_results = await runtime.run_in_session(BashAction(command=runner_cmd, timeout=runner_timeout)) + logger.info(f"agent's run results: {run_results}") + logger.info('Runner script finished.') + + # For long-running agents: upload eval scripts before running evaluation + if is_long_running_agent: + logger.info(f'Uploading _agent_eval directories for evaluation ({agent_label})...') + eval_dirs = [] + for root, dirs, files in os.walk(project_path): + if '_agent_eval' in dirs: + eval_source_path = os.path.join(root, '_agent_eval') + rel_path = os.path.relpath(eval_source_path, project_path) + 
eval_dirs.append((eval_source_path, rel_path)) + + if eval_dirs: + for eval_source_path, rel_path in eval_dirs: + target_eval_path = os.path.join('/repo', rel_path) + logger.info(f'Uploading _agent_eval from {eval_source_path} to {target_eval_path}') + try: + await runtime.upload( + UploadRequest( + source_path=eval_source_path, + target_path=target_eval_path, + ) + ) + logger.info(f'_agent_eval directory uploaded: {rel_path}') + except Exception as e: + logger.warning(f'Failed to upload _agent_eval from {eval_source_path}: {e}') + logger.info('All _agent_eval directories uploaded for evaluation.') + else: + logger.warning(f'No _agent_eval directories found in {project_path}') + + try: + test_output = await runtime.run_in_session(BashAction(command=test_method)) + logger.info(test_output) + result = { + 'task': task, + 'project_path': project_path, + 'agent_run_results': run_results.output if hasattr(run_results, 'output') else str(run_results), + 'test_method': test_method, + 'score': _parse_eval_score(test_output), + 'status': 'success', + } + except Exception as e: + logger.info(f'Error running test method: {e}') + result = { + 'task': task, + 'project_path': project_path, + 'agent_run_results': run_results.output if hasattr(run_results, 'output') else str(run_results), + 'test_method': test_method, + 'score': 0, + 'status': f'error: {str(e)}', + } + + # For long-running agents: keep container running for inspection + if is_long_running_agent: + logger.info('=' * 80) + logger.info(f'Keeping Docker container running for {agent_label} (for debugging purposes).') + + container_id = "unknown" + container_name = "unknown" + try: + container_id_res = await runtime.run_in_session( + BashAction(command='cat /etc/hostname 2>/dev/null || hostname 2>/dev/null || echo "unknown"', timeout=10.0) + ) + container_id = str(getattr(container_id_res, "output", "")).strip() + + try: + docker_info_res = await runtime.run_in_session( + BashAction(command='cat /proc/self/cgroup 
2>/dev/null | grep docker | head -1 | cut -d/ -f3 | cut -c1-12 || echo ""', timeout=10.0) + ) + docker_container_id = str(getattr(docker_info_res, "output", "")).strip() + if docker_container_id: + container_id = docker_container_id + except Exception: + pass + + if hasattr(deployment, '_container_id'): + container_id = deployment._container_id + elif hasattr(deployment, 'container_id'): + container_id = deployment.container_id + if hasattr(deployment, '_container_name'): + container_name = deployment._container_name + elif hasattr(deployment, 'container_name'): + container_name = deployment.container_name + except Exception as e: + logger.warning(f'Failed to get container information: {e}') + + logger.info(f'Container Information:') + logger.info(f' Container ID: {container_id}') + logger.info(f' Container Name: {container_name}') + logger.info(f' Task ID: {task_id}') + logger.info(f' Project Path: {project_path}') + logger.info(f' To inspect the container, use: docker exec -it {container_id} /bin/bash') + logger.info(f' Or find container by name/image and inspect manually') + logger.info(f' NOTE: Container will remain running. To stop it manually, use: docker stop {container_id}') + logger.info(f' WARNING: Remember to clean up containers to save storage space!') + logger.info('=' * 80) + + result['container_id'] = container_id + result['container_name'] = container_name + result['container_kept'] = True + else: + await deployment.stop() + result['container_kept'] = False + + + return result + + +def run_eval(deployment, project_path, task_id, task, model, agent_path, test_method, save_path, run_on_host=False): + """Run evaluation either on host or in Docker container. 
+ + Args: + deployment: Docker image to use (ignored if run_on_host=True) + project_path: Path to the artifact project + task_id: Task identifier + task: Task description + model: Model name + agent_path: Path to agent scripts + test_method: Evaluation command + save_path: Path to save results + run_on_host: If True, run directly on host machine instead of Docker + """ + + if run_on_host: + # Run directly on host machine (no Docker container) + logger.info(f"Task {task_id} configured to run on HOST machine (run_on_host=True)") + return asyncio.run( + run_eval_on_host(project_path, task_id, task, model, agent_path, test_method, save_path) + ) + + # Run in Docker container (original behavior) + image = deployment or 'bastoica/ae-agent-ubuntu24.04:latest' + + # Enable privileged mode for Docker-in-Docker scenarios (e.g., Kind clusters) + # This is required for Kubernetes-based artifact evaluations like Acto + # Additional args for cgroups v2 compatibility: + # - --cgroupns=host: Share cgroup namespace with host (required for Kind in cgroups v2) + # - Environment variables for Kind cgroups v2 compatibility + config = DockerDeploymentConfig( + image=image, + startup_timeout=1200.0, + docker_args=[ + '--privileged', # Required for Kind cluster creation + '--cgroupns=host', # Required for Kind nodes to start with cgroups v2 + '-e', 'KIND_EXPERIMENTAL_CONTAINERD_SNAPSHOTTER=native', # Better cgroups v2 support + ], + ) + deployment_obj = config.get_deployment() + + return asyncio.run( + run_eval_in_env(deployment_obj, project_path, task_id, task, model, agent_path, test_method, save_path) + ) + + + +def test(): + task = 'The java is not installed. Can you please setup it? Note: you are in a docker with root permission. DO NOT use sudo.' 
+ project_path = '../data/benchmark/projects/test-repo' + test_method = 'java -version' + deployment = 'xuafeng/swe-go-python:latest' + model = 'claude-sonnet-4-5-20250929' + agent_path = './agents/claudecode' + save_path = './eval_results' + task_id = 'test_task_1' + result = run_eval(deployment, project_path, task_id, task, model, agent_path, test_method, save_path) + print('Test result:', result) + + +# TODO: still work on add openhand agent +def test1(): + task = 'The java is not installed. Can you please setup it? Note: you are in a docker with root permission. DO NOT use sudo.' + project_path = '../data/benchmark/projects/test-repo' + test_method = 'java -version' + deployment = 'xuafeng/swe-go-python:latest' + model = 'claude-sonnet-4-5-20250929' + agent_path = './agents/openhand' + save_path = './eval_results' + task_id = 'test_task_1' + result = run_eval(deployment, project_path, task_id, task, model, agent_path, test_method, save_path) + print('Test result:', result) + + +def test2(): + task = "create a python file named hello.py that prints 'hello world'" + project_path = '../data/benchmark/projects/test-repo' + test_method = 'python hello.py' + deployment = 'xuafeng/swe-go-python:latest' + model = 'claude-sonnet-4-5-20250929' + agent_path = './agents/claudecode' + save_path = './eval_results' + task_id = 'test_task_1' + eval_out = asyncio.run( + run_eval_in_env(deployment, project_path, task_id, task, model, agent_path, test_method, save_path) + ) + print(eval_out) + + +if __name__ == '__main__': + test1() From 52f995eaa9059414fb6ed819a8e638b2eee30005 Mon Sep 17 00:00:00 2001 From: Bogdan 'Bo' Stoica Date: Thu, 12 Feb 2026 10:18:40 +0000 Subject: [PATCH 2/7] Remove unused AE agent helper files from data/benchmark --- .../data/benchmark/README_ae_agent.md | 38 ------------------- .../data/benchmark/run_ae_agent.sh | 16 -------- 2 files changed, 54 deletions(-) delete mode 100644 benchmarks/arteval_bench/data/benchmark/README_ae_agent.md delete mode 
100755 benchmarks/arteval_bench/data/benchmark/run_ae_agent.sh diff --git a/benchmarks/arteval_bench/data/benchmark/README_ae_agent.md b/benchmarks/arteval_bench/data/benchmark/README_ae_agent.md deleted file mode 100644 index 40d39473..00000000 --- a/benchmarks/arteval_bench/data/benchmark/README_ae_agent.md +++ /dev/null @@ -1,38 +0,0 @@ -# Run ArtEval Benchmark with AE Agent - -This directory contains `arteval_tasks.jsonl` and other benchmark task definitions. To run the benchmark with **ae_agent**, start from the **benchmark root** (`benchmarks/arteval_bench/`). - -## Run from benchmark root - -```bash -cd benchmarks/arteval_bench - -# Use ae_agent with data/benchmark/arteval_tasks.jsonl as input -python src/main.py \ - -i ./data/benchmark/arteval_tasks.jsonl \ - -a ae_agent \ - -m claude-sonnet-4-5-20250929 \ - -o ./outputs/ae_agent_$(date +%Y-%m-%d_%H-%M-%S) -``` - -Or, if `run.sh` supports passing an agent argument: - -```bash -cd benchmarks/arteval_bench -./run.sh claude-sonnet-4-5-20250929 ae_agent -``` - -## Environment - -- Set `ANTHROPIC_API_KEY` or `ANTHROPIC_FOUNDRY_API_KEY`. -- Optional: `ANTHROPIC_FOUNDRY_BASE_URL`, `CLAUDE_CODE_USE_FOUNDRY=1`. -- The ae_agent implementation lives under `src/agents/ae_agent/`, synced with the standalone ae-agent repo (runner, install, utils, interactive_runner). - -## Task format - -Each line of `arteval_tasks.jsonl` is one JSON object, including at least: - -- `artifact_id`, `artifact_dir`, `artifact_readme`, `artifact_url` -- `evaluator`: evaluation command (e.g. 
`cd /repo && python3 _agent_eval/main.py`) -- `docker_env`: Docker image -- `run_on_host`: when `true`, run on the host instead of Docker diff --git a/benchmarks/arteval_bench/data/benchmark/run_ae_agent.sh b/benchmarks/arteval_bench/data/benchmark/run_ae_agent.sh deleted file mode 100755 index 541f581a..00000000 --- a/benchmarks/arteval_bench/data/benchmark/run_ae_agent.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -# Run ArtEval benchmark with ae_agent. Execute this script from the benchmark root. -# Usage: ./run_ae_agent.sh [optional: model name, default claude-sonnet-4-5-20250929] - -set -e -BENCH_ROOT="$(cd "$(dirname "$0")/../.." && pwd)" -MODEL_NAME="${1:-claude-sonnet-4-5-20250929}" -cd "$BENCH_ROOT" -echo "==> ArtEval benchmark root: $BENCH_ROOT" -echo "==> Model: $MODEL_NAME" -echo "==> Agent: ae_agent" -python src/main.py \ - -i ./data/benchmark/arteval_tasks.jsonl \ - -a ae_agent \ - -m "$MODEL_NAME" \ - -o "./outputs/ae_agent_${MODEL_NAME//\//_}_$(date +%Y-%m-%d_%H-%M-%S)" From 181efc4bbbc6a8a2076f820190b66fb1c7b84688 Mon Sep 17 00:00:00 2001 From: Bogdan 'Bo' Stoica Date: Fri, 13 Feb 2026 00:50:14 +0000 Subject: [PATCH 3/7] arteval_bench: add host/local mode to ae_agent and unify task list format - Add run_eval.py and main.py to ae_agent for running tasks on host (env=local) or in Docker; run_eval(env, ...) is the single entry point. - Expand utils.py with helpers for main/run_eval (safe_task_id, env_from_item, resolve_project_path, Tee, write_task_report, compute_and_write_summary). - Update ae_agent README with host mode usage and new file descriptions. - Unify arteval_tasks.jsonl to new format: artifact_id, artifact_dir, artifact_readme, artifact_url, env, gpu; remove evaluator/expected_score. - Ignore duplicate task list copies (arteval_tasks copy*.jsonl) in .gitignore. 
--- benchmarks/arteval_bench/.gitignore | 3 + .../data/benchmark/arteval_tasks.jsonl | 2 +- .../src/agents/ae_agent/README.md | 28 +- .../arteval_bench/src/agents/ae_agent/main.py | 157 ++++ .../src/agents/ae_agent/run_eval.py | 697 ++++++++++++++++++ .../src/agents/ae_agent/utils.py | 235 +++++- benchmarks/arteval_bench/src/main.py | 2 +- 7 files changed, 1117 insertions(+), 7 deletions(-) create mode 100644 benchmarks/arteval_bench/src/agents/ae_agent/main.py create mode 100644 benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py diff --git a/benchmarks/arteval_bench/.gitignore b/benchmarks/arteval_bench/.gitignore index 64b18d31..8de6bf48 100644 --- a/benchmarks/arteval_bench/.gitignore +++ b/benchmarks/arteval_bench/.gitignore @@ -117,3 +117,6 @@ a.out # Build directories build/ cmake-build-*/ + +# Duplicate task list copies (canonical: arteval_tasks.jsonl) +data/benchmark/arteval_tasks copy*.jsonl diff --git a/benchmarks/arteval_bench/data/benchmark/arteval_tasks.jsonl b/benchmarks/arteval_bench/data/benchmark/arteval_tasks.jsonl index 1f46440a..6d02f195 100644 --- a/benchmarks/arteval_bench/data/benchmark/arteval_tasks.jsonl +++ b/benchmarks/arteval_bench/data/benchmark/arteval_tasks.jsonl @@ -3,4 +3,4 @@ {"artifact_id": "sosp23_acto", "artifact_dir": "sosp23_acto", "artifact_readme": "sosp23_acto/acto/README.md", "artifact_url": "https://github.com/xlab-uiuc/acto", "evaluator": "sosp23_acto/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} {"artifact_id": "eurosys25_egwalker", "artifact_dir": "eurosys25_egwalker", "artifact_readme": "eurosys25_egwalker/egwalker/README.md", "artifact_url": "https://github.com/josephg/egwalker-paper", "evaluator": "eurosys25_egwalker/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} {"artifact_id": "eurosys25_depsurf", "artifact_dir": "eurosys25_depsurf", "artifact_readme": "eurosys25_depsurf/depsurf/README.md", "artifact_url": 
"https://github.com/ShawnZhong/DepSurf", "evaluator": "eurosys25_depsurf/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} -{"artifact_id": "osdi24_eet", "artifact_dir": "osdi24_eet", "artifact_readme": "osdi24_eet/eet/README.md", "artifact_url": "https://github.com/JZuming/EET", "evaluator": "osdi24_eet/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} \ No newline at end of file +{"artifact_id": "osdi24_eet", "artifact_dir": "osdi24_eet", "artifact_readme": "osdi24_eet/eet/README.md", "artifact_url": "https://github.com/JZuming/EET", "evaluator": "osdi24_eet/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/README.md b/benchmarks/arteval_bench/src/agents/ae_agent/README.md index e1a9c4f2..7ebb5acf 100644 --- a/benchmarks/arteval_bench/src/agents/ae_agent/README.md +++ b/benchmarks/arteval_bench/src/agents/ae_agent/README.md @@ -2,12 +2,14 @@ This agent is the **ae-agent** logic integrated as a sub-agent of the system-intelligence-benchmark ArtEval benchmark. It uses the Claude Agent SDK to run artifact evaluation tasks inside the benchmark container. Code is synced from the standalone [ae-agent](https://github.com/Couen/ae-agent) repo. -## Files (synced from ae-agent) +## Files - **install.sh**: Installs `claude-agent-sdk==0.1.24` and configures `~/.claude/settings.json` (48h Bash timeout). - **runner.sh**: Entry point invoked as `runner.sh `. Forwards to `runner.py`. Uses `/agent/current_task.txt` when the benchmark passes task via file. - **runner.py**: Runs the task with Claude Agent SDK; supports rate-limit retry (429), message_formatter; second argument can be task text or path to file. -- **utils.py**: `DEFAULT_TIMEOUT_MS` for the runner. +- **run_eval.py**: Orchestration for one task: `env='local'` runs on host, otherwise runs in Docker (requires swerex/swe-rex). 
+- **main.py**: CLI entry for batch runs from JSONL; supports both host and Docker per task (see “Run on host (local)” below). +- **utils.py**: `DEFAULT_TIMEOUT_MS`, task/path helpers, Tee, reports, summary (used by runner, main, run_eval). - **interactive_runner.py**: Interactive multi-turn session inside container (e.g. `docker exec -it python3 /agent/interactive_runner.py `). - **__init__.py**: Package marker. @@ -37,6 +39,26 @@ The benchmark will: - Python 3 with `claude-agent-sdk` (installed by `install.sh`). - Optional: `message_formatter` for prettier output (if present in the environment). +## Run on host (local) + +You can run tasks **on the host machine** (no Docker) from this directory: + +1. **Single-task / batch via main.py** + Use a JSONL input where each line can set `"env": "local"` or `"run_on_host": true` to run that task on the host. Other lines without that run in Docker (if swerex is available). + + ```bash + cd benchmarks/arteval_bench/src/agents/ae_agent + python main.py -i /path/to/tasks.jsonl -a ae_agent -m claude-sonnet-4-5-20250929 -o ./outputs/host_run + ``` + +2. **Requirements for host mode** + - `ANTHROPIC_API_KEY` or `ANTHROPIC_FOUNDRY_API_KEY` set + - Docker installed and running (for prereq check; agent runs on host) + - `pip install claude-agent-sdk` + +3. **Docker mode from this directory** + If JSONL has `"env": "docker"` (or no `run_on_host`), `main.py` will run that task in Docker via `run_eval.py` (requires `swe-rex` / `swerex`). + ## Relation to standalone ae-agent repo -The standalone ae-agent repo provides a full CLI (`main.py`, `run_eval.py`, `utils.py`) and host/Docker orchestration. This sub-agent is the in-container runner only; the benchmark’s `run_eval_in_env.py` handles orchestration, task file upload, and Foundry env vars. +The standalone ae-agent repo provides the same host/Docker CLI. 
This sub-agent includes both the **in-container** runner (used by the benchmark’s `run_eval_in_env.py`) and the **host/local** mode via `main.py` and `run_eval.py`. diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/main.py b/benchmarks/arteval_bench/src/agents/ae_agent/main.py new file mode 100644 index 00000000..ae36ae71 --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/main.py @@ -0,0 +1,157 @@ +"""Main entry point for running artifact tasks (host or Docker). + +Supports both: +- Run from this directory: env=local (host) or env=docker per task in JSONL. +- Used as in-container runner when benchmark uploads this agent to /agent. +""" + +import argparse +import json +import os +import sys +from datetime import datetime + +from .run_eval import run_eval +from .utils import ( + compute_and_write_summary, + docker_image_from_item, + env_from_item, + get_task, + gpu_from_item, + interactive_from_item, + read_task_from_file, + resolve_project_path, + safe_task_id, + Tee, + timeout_ms_from_item, + write_task_report, +) + + +def main(input_file, model, agent, save_path, interactive_default: bool = False): + """Main function for running tasks.""" + print(f'Using model: {model}, agent: {agent}') + + with open(input_file) as f: + for line in f: + if not line.strip(): + continue + try: + item = json.loads(line) + except json.JSONDecodeError: + print(f'Skipping invalid JSON line: {line}') + continue + + env = env_from_item(item) + docker_image = docker_image_from_item(item) + use_gpu = gpu_from_item(item) + interactive = interactive_from_item(item) or interactive_default + task_file = item.get("artifact_readme", None) + task_id = item.get("artifact_id", None) + timeout_ms = timeout_ms_from_item(item) + safe_id = safe_task_id(task_id) + + project_path, path_error = resolve_project_path(item, input_file, save_path) + if path_error: + print(path_error) + continue + print(f"Project path: {project_path}") + + task = read_task_from_file(project_path, 
task_file) if task_file else get_task("README.md") + summary_basename = f'ae_summary_{safe_id}.md' + task = task.rstrip() + f"\n\nAt the end, write a brief summary of what you did and the result to {summary_basename} in the artifact root (so it can be included in the report)." + + task_file_path = os.path.join(save_path, f'current_task_{safe_id}.txt') + with open(task_file_path, 'w', encoding='utf-8') as f: + f.write(task) + + print(f"Task {task_id}: env={env}, timeout_ms={timeout_ms if timeout_ms is not None else 'default'}, gpu={use_gpu}, interactive={interactive}") + + log_path = os.path.join(save_path, f'ae_log_{safe_id}.log') + with open(log_path, 'w', encoding='utf-8') as lf: + lf.write(f"Task {task_id} started at {datetime.now().isoformat()}\n") + lf.write(f"Project path: {project_path}\n") + lf.write(f"Env: {env}\n\n") + old_stdout, old_stderr = sys.stdout, sys.stderr + try: + with Tee(sys.stdout, log_path) as tee_out: + with Tee(sys.stderr, log_path) as tee_err: + sys.stdout, sys.stderr = tee_out, tee_err + result = run_eval( + env=env, + docker_image=docker_image, + project_path=project_path, + task_id=task_id, + task=task, + model=model, + agent_path=agent, + save_path=save_path, + timeout_ms=timeout_ms, + use_gpu=use_gpu, + task_file_path=task_file_path, + interactive=interactive, + ) + finally: + sys.stdout, sys.stderr = old_stdout, old_stderr + + result["timestamp"] = datetime.now().isoformat() + result["log_file"] = log_path + with open(f"{save_path}/result.jsonl", "a+", encoding="utf-8") as fw: + fw.write(json.dumps(result, ensure_ascii=False) + "\n") + with open(log_path, "a", encoding="utf-8") as lf: + lf.write(f"\nTask finished at {result['timestamp']}, status: {result.get('status', 'unknown')}\n") + lf.write("\n--- Agent run output ---\n") + run_out = str(result.get("agent_run_results", "")) + lf.write(run_out[:50000]) + if len(run_out) > 50000: + lf.write("\n... 
(truncated)\n") + + summary_file = os.path.join(project_path, summary_basename) + agent_summary = "" + if os.path.isfile(summary_file): + try: + with open(summary_file, "r", encoding="utf-8") as f: + agent_summary = f.read() + except Exception: + pass + if not agent_summary: + agent_summary = (str(result.get("agent_run_results", ""))[:8000] or "(No summary captured)") + write_task_report(save_path, safe_id, task_id, result, log_path, agent_summary) + print(f"Task {task_id} completed. Status: {result.get('status', 'unknown')}") + + total_count, success_count = compute_and_write_summary(save_path) + print(f"All tasks completed: {success_count}/{total_count} succeeded.") + + +def cli_main(): + """CLI entry point.""" + parser = argparse.ArgumentParser(description='AE Agent - Run Claude Agent SDK on artifact tasks (host or Docker)') + parser.add_argument('-i', '--input_file', help='Input JSONL file with tasks', default='./data/benchmark/arteval_tasks.jsonl') + parser.add_argument('-o', '--save_path', help='Result save path', default=None) + parser.add_argument('-a', '--agent', help='Agent name (default: ae_agent)', default='ae_agent') + parser.add_argument('-m', '--model_name', help='Model Name', default='claude-sonnet-4-5-20250929') + parser.add_argument('--interactive', action='store_true', help='Enable interactive mode (continue giving agent instructions after task completes)') + args = parser.parse_args() + model_name = args.model_name + agent = args.agent + input_file = args.input_file + save_path = args.save_path + if save_path is None: + str_model_name = model_name.replace('/', '_').lower() + timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + save_path = os.path.join('./outputs', f'ae_{str_model_name}_ae_agent_{timestamp}') + # When running from this directory, use it as agent path + if agent in ('ae-agent', 'ae_agent', 'claude_sdk'): + script_dir = os.path.dirname(os.path.abspath(__file__)) + agent = script_dir + save_path = 
os.path.abspath(os.path.expanduser(save_path)) + os.makedirs(save_path, exist_ok=True) + interactive_default = getattr(args, 'interactive', False) + print(f"Input file: {input_file}") + print(f"Save path: {save_path}") + print(f"Agent path: {agent}") + main(input_file, model_name, agent, save_path, interactive_default=interactive_default) + + +if __name__ == '__main__': + cli_main() diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py b/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py new file mode 100644 index 00000000..a83e8b91 --- /dev/null +++ b/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py @@ -0,0 +1,697 @@ +"""Runner for executing artifact tasks in Docker or on host. + +Single entry point: run_eval(env, project_path, task_id, task, model, agent_path, save_path, ...). +- env='local' → run on host (internal: _run_local). +- env != 'local' → run in Docker (internal: run_eval_in_env). +""" + +import asyncio +import json +import os +import shutil +import subprocess +import sys +import tempfile +from pathlib import Path + +from .utils import DEFAULT_TIMEOUT_MS, safe_task_id + +# Try to import SWE-ReX (historically called "swerex") for Docker deployment. +SWEREX_AVAILABLE = False + +try: + from swerex.deployment.docker import DockerDeploymentConfig + from swerex.runtime.abstract import BashAction, Command, CreateBashSessionRequest, UploadRequest + SWEREX_AVAILABLE = True +except ImportError: + try: + from swe_rex.deployment.docker import DockerDeploymentConfig + from swe_rex.runtime.abstract import BashAction, Command, CreateBashSessionRequest, UploadRequest + SWEREX_AVAILABLE = True + except ImportError: + SWEREX_AVAILABLE = False + print("WARNING: swerex/swe-rex not available. 
Docker mode will not work.", file=sys.stderr) + + +def build_system_prompt(artifact_path: str, task: str) -> str: + """Build the system prompt for running an artifact task on the host.""" + return f"""You are an experienced software engineer completing an artifact task. + +ENVIRONMENT SETUP (HOST MACHINE - NOT DOCKER): +- You are running DIRECTLY on the host machine (NOT inside a Docker container) +- Docker daemon is already running on this host +- When you use Kind to create Kubernetes clusters, they will be created using the host's Docker +- This avoids Docker-in-Docker compatibility issues +- You may need sudo for some operations + +ARTIFACT LOCATION: +- The artifact repository is located at: {artifact_path} +- Start by changing to this directory: cd {artifact_path} + +YOUR TASK: +{task} + +TIMEOUT CONFIGURATION (CRITICAL): +- Long-running commands (builds, tests, Kind cluster creation) are expected +- DO NOT set short timeouts - let commands complete naturally +- Kind cluster creation can take 5-10 minutes +- Full benchmark runs can take hours + +IMPORTANT GUIDELINES: +1. First, cd to {artifact_path} and examine the directory structure +2. Follow the README instructions step by step +3. You MUST execute every verification step, test, or command that the README (or referenced docs like TESTBED.md) says is required for evaluation or reproduction. Do NOT skip any such step just because the README mentions that it may take a long time. Long runtimes are expected; run each verification and wait for completion. +4. If you see 'sudo' in instructions, you can use it (or skip if already root) +5. Use the Bash tool to run commands, Read tool to inspect files +6. Work systematically through setup, build, and experiment execution +7. If you encounter errors, debug and resolve them using available tools +8. 
For Kind clusters, they will work properly since you're on the host (not DinD)""" + + +def check_prerequisites_on_host() -> bool: + """Check that required tools (docker, python, API key) are available on the host. Returns True if OK.""" + if not shutil.which("docker"): + print("ERROR: Docker is not installed on host.", file=sys.stderr) + return False + result = subprocess.run(["docker", "ps"], capture_output=True, timeout=10) + if result.returncode != 0: + print("ERROR: Docker is not running on host.", file=sys.stderr) + return False + if not os.environ.get("ANTHROPIC_API_KEY") and not os.environ.get("ANTHROPIC_FOUNDRY_API_KEY"): + print("ERROR: ANTHROPIC_API_KEY or ANTHROPIC_FOUNDRY_API_KEY must be set.", file=sys.stderr) + return False + return True + + +def setup_claude_settings_on_host(timeout_ms: int): + """Set up ~/.claude/settings.json with timeout configuration on host.""" + claude_dir = Path.home() / ".claude" + settings_file = claude_dir / "settings.json" + claude_dir.mkdir(exist_ok=True) + settings = { + "env": { + "BASH_MAX_TIMEOUT_MS": str(timeout_ms), + "BASH_DEFAULT_TIMEOUT_MS": str(timeout_ms), + } + } + with open(settings_file, 'w') as f: + json.dump(settings, f, indent=2) + print(f"Created {settings_file} with timeout configuration: {timeout_ms} ms.") + + +async def _run_local( + project_path, task_id, task, model, agent_path, save_path, timeout_ms: int, *, + skip_prereq_check: bool = False, interactive: bool = False +): + """Internal: run one task on the host (no Docker). 
Used by run_eval when env='local'.""" + print("=" * 80) + print("Running task directly on HOST MACHINE (not in Docker)") + print("=" * 80) + + if not skip_prereq_check and not check_prerequisites_on_host(): + raise RuntimeError("Host prerequisites check failed (docker, ANTHROPIC_API_KEY)") + + setup_claude_settings_on_host(timeout_ms) + + project_path = os.path.abspath(project_path) + if not os.path.isdir(project_path): + raise RuntimeError(f"Project path does not exist: {project_path}") + + print(f"Project path: {project_path}") + print(f"Task ID: {task_id}") + print(f"Model: {model}") + + try: + from claude_agent_sdk import ClaudeAgentOptions + if interactive: + from claude_agent_sdk import ClaudeSDKClient + else: + from claude_agent_sdk import query + except ImportError as e: + raise RuntimeError(f"claude_agent_sdk not installed: {e}. Install with: pip install claude-agent-sdk") + + system_prompt = build_system_prompt(project_path, task) + options = ClaudeAgentOptions( + system_prompt=system_prompt, + allowed_tools=["Read", "Write", "Bash"], + setting_sources=["user"], + ) + + os.environ['BASH_MAX_TIMEOUT_MS'] = str(timeout_ms) + os.environ['BASH_DEFAULT_TIMEOUT_MS'] = str(timeout_ms) + + message_count = 0 + run_results_output = "" + + if interactive: + print("Starting Claude Agent SDK (Host Mode, Interactive)...") + async with ClaudeSDKClient(options=options) as client: + await client.query( + f"Please start the artifact task. Begin by changing to the artifact directory at {project_path} and examining its contents." + ) + async for message in client.receive_response(): + message_count += 1 + if message_count % 10 == 0: + print(f"[Progress] Processed {message_count} messages...") + msg_str = str(message) + print(msg_str) + if 'ResultMessage' in msg_str or 'TextBlock' in msg_str: + run_results_output = msg_str + + print(f"Claude Agent SDK execution completed. 
Total messages: {message_count}") + print("\n" + "=" * 60) + print("Interactive mode - Type your follow-up instructions (or 'quit'/'exit' to end).") + print("=" * 60 + "\n") + + while True: + try: + user_input = input("\n>>> ").strip() + except (EOFError, KeyboardInterrupt): + print("\nExiting interactive mode.", flush=True) + break + if not user_input: + continue + if user_input.lower() in ('quit', 'exit', 'q'): + print("Exiting interactive mode.", flush=True) + break + + await client.query(user_input) + async for msg in client.receive_response(): + msg_str = str(msg) + print(msg_str) + if 'ResultMessage' in msg_str or 'TextBlock' in msg_str: + run_results_output = msg_str + else: + print("Starting Claude Agent SDK (Host Mode)...") + try: + async for message in query( + prompt=f"Please start the artifact task. Begin by changing to the artifact directory at {project_path} and examining its contents.", + options=options + ): + message_count += 1 + if message_count % 10 == 0: + print(f"[Progress] Processed {message_count} messages...") + msg_str = str(message) + print(msg_str) + if 'ResultMessage' in msg_str or 'TextBlock' in msg_str: + run_results_output = msg_str + + print(f"Claude Agent SDK execution completed. Total messages: {message_count}") + + except Exception as e: + print(f"ERROR: Claude Agent SDK execution failed: {e}") + import traceback + traceback.print_exc() + run_results_output = f"Error: {e}" + + result = { + 'task_id': task_id, + 'task': task, + 'project_path': project_path, + 'agent_run_results': run_results_output, + 'message_count': message_count, + 'status': 'success' if message_count > 0 else 'error', + 'run_on_host': True, + 'container_id': None, + 'saved_image': None, + 'container_stopped': False, + } + + return result + + +def _save_container_as_image(container_id: str, project_path: str, task_id: str) -> tuple[str | None, bool]: + """Save Docker container as image (docker cp, commit, stop). 
Returns (saved_image_tag or None, container_stopped).""" + project_path_abs = os.path.abspath(project_path) + if os.path.isdir(project_path_abs): + try: + cp_proc = subprocess.run( + ["docker", "cp", f"{container_id}:/repo/.", project_path_abs], + capture_output=True, + text=True, + timeout=600, + ) + if cp_proc.returncode == 0: + print(f"Synced container /repo to host workspace: {project_path_abs}") + else: + print( + f"WARNING: docker cp failed (container {container_id} -> {project_path_abs}): " + f"{cp_proc.stderr.strip()}" + ) + except subprocess.TimeoutExpired: + print(f"WARNING: docker cp timed out copying /repo from container {container_id}") + except Exception as e: + print(f"WARNING: Exception during docker cp from container {container_id}: {e}") + else: + print(f"WARNING: project_path does not exist, skipping workspace sync: {project_path_abs}") + + sid = safe_task_id(task_id, fallback="unknown_task") + saved_image = f"ae-agent-{sid.lower()}:latest" + try: + commit_proc = subprocess.run( + ["docker", "commit", container_id, saved_image], + capture_output=True, + text=True, + timeout=600, + ) + if commit_proc.returncode == 0: + print(f"Saved container {container_id} as image '{saved_image}'.") + else: + print( + f"WARNING: docker commit failed for container {container_id}: " + f"{commit_proc.stderr.strip()}" + ) + saved_image = None + except Exception as e: + print(f"WARNING: Exception during docker commit for container {container_id}: {e}") + saved_image = None + + container_stopped = False + try: + stop_proc = subprocess.run( + ["docker", "stop", container_id], + capture_output=True, + text=True, + timeout=60, + ) + if stop_proc.returncode == 0: + print(f"Stopped container {container_id}.") + container_stopped = True + else: + print( + f"WARNING: docker stop failed for container {container_id}: " + f"{stop_proc.stderr.strip()}" + ) + except Exception as e: + print(f"WARNING: Exception during docker stop for container {container_id}: {e}") + + return 
(saved_image, container_stopped) + + +def _validate_agent_path(agent_path: str) -> None: + """Ensure agent_path exists and has required files. Raises RuntimeError if invalid.""" + if not agent_path or not os.path.isdir(agent_path): + raise RuntimeError(f"Agent path does not exist or is not a directory: {agent_path}") + required = ["runner.sh", "runner.py", "install.sh"] + for f in required: + p = os.path.join(agent_path, f) + if not os.path.isfile(p): + raise RuntimeError(f"Agent path missing required file: {f} (expected at {p})") + + +async def run_eval_in_env( + deployment, project_path, task_id, task, model, agent_path, save_path, + task_file_path: str | None = None, interactive: bool = False +): + """Run task in Docker container.""" + if not SWEREX_AVAILABLE: + raise RuntimeError("swerex is not available. Cannot run in Docker mode.") + + _validate_agent_path(agent_path) + + await deployment.start() + runtime = deployment.runtime + + timeout_ms_env = os.environ.get("BASH_MAX_TIMEOUT_MS") + try: + timeout_s = float(timeout_ms_env) / 1000.0 if timeout_ms_env else (DEFAULT_TIMEOUT_MS / 1000.0) + except (ValueError, TypeError): + timeout_s = DEFAULT_TIMEOUT_MS / 1000.0 + + if hasattr(runtime, "_config"): + print(f"Current RemoteRuntime timeout: {runtime._config.timeout}s") + runtime._config.timeout = timeout_s + print(f"Overriding RemoteRuntime timeout to {timeout_s}s based on BASH_MAX_TIMEOUT_MS") + + await runtime.create_session(CreateBashSessionRequest()) + + print('Uploading project files...') + await runtime.upload( + UploadRequest( + source_path=project_path, + target_path='/repo', + ) + ) + print('Project files uploaded.') + + is_ae_agent = 'ae_agent' in str(agent_path) or str(agent_path).endswith('claude_sdk') + + await runtime.run_in_session(BashAction(command='cd /repo')) + pwd_result = await runtime.run_in_session(BashAction(command='pwd')) + print(f'Current directory: {pwd_result}') + ls_result = await runtime.run_in_session(BashAction(command='ls')) + 
print(f'Current directory contents: {ls_result}') + + print('Uploading agent runner script...') + await runtime.upload( + UploadRequest( + source_path=agent_path, + target_path='/agent', + ) + ) + print('Agent runner script uploaded.') + + print('Setup the agent running environment...') + await runtime.run_in_session(BashAction(command='chmod +x /agent/runner.sh /agent/install.sh 2>/dev/null; /agent/install.sh')) + + if task_file_path and os.path.isfile(task_file_path): + tmpdir = tempfile.mkdtemp(prefix='ae_agent_task_') + try: + dest = os.path.join(tmpdir, 'current_task.txt') + shutil.copy2(task_file_path, dest) + await runtime.upload(UploadRequest(source_path=tmpdir, target_path='/agent_task_file')) + await runtime.run_in_session(BashAction(command='cp /agent_task_file/current_task.txt /agent/current_task.txt', timeout=10.0)) + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + else: + tmpdir = tempfile.mkdtemp(prefix='ae_agent_task_') + try: + task_file_host = os.path.join(tmpdir, 'current_task.txt') + with open(task_file_host, 'w', encoding='utf-8') as f: + f.write(task) + await runtime.upload(UploadRequest(source_path=tmpdir, target_path='/agent_task_file')) + await runtime.run_in_session(BashAction(command='cp /agent_task_file/current_task.txt /agent/current_task.txt', timeout=10.0)) + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + + if timeout_ms_env: + set_timeout_cmd = ( + f"export BASH_MAX_TIMEOUT_MS='{timeout_ms_env}' && " + f"export BASH_DEFAULT_TIMEOUT_MS='{timeout_ms_env}'" + ) + print(f"Setting BASH_MAX_TIMEOUT_MS/BASH_DEFAULT_TIMEOUT_MS in container to {timeout_ms_env} ms...") + await runtime.run_in_session(BashAction(command=set_timeout_cmd)) + + if is_ae_agent: + parts = [] + anthropic_api_key = os.environ.get('ANTHROPIC_API_KEY') + foundry_api_key = os.environ.get('ANTHROPIC_FOUNDRY_API_KEY') + if anthropic_api_key: + escaped_key = anthropic_api_key.replace("'", "'\"'\"'") + parts.append(f"export ANTHROPIC_API_KEY='{escaped_key}'") + 
if foundry_api_key: + escaped_foundry_key = foundry_api_key.replace("'", "'\"'\"'") + parts.append(f"export ANTHROPIC_FOUNDRY_API_KEY='{escaped_foundry_key}'") + if not anthropic_api_key: + parts.append(f"export ANTHROPIC_API_KEY='{escaped_foundry_key}'") + foundry_base = os.environ.get('ANTHROPIC_FOUNDRY_BASE_URL') + if foundry_base: + escaped_url = foundry_base.replace("'", "'\"'\"'") + parts.append(f"export ANTHROPIC_FOUNDRY_BASE_URL='{escaped_url}'") + if os.environ.get('CLAUDE_CODE_USE_FOUNDRY') == '1': + parts.append("export CLAUDE_CODE_USE_FOUNDRY=1") + if parts: + set_env_cmd = " && ".join(parts) + print('Setting Anthropic/Foundry API key and env in container...') + await runtime.run_in_session(BashAction(command=set_env_cmd)) + if not anthropic_api_key and not foundry_api_key: + print('WARNING: Neither ANTHROPIC_API_KEY nor ANTHROPIC_FOUNDRY_API_KEY found on host. Runner may fail.') + + print('Running runner script...') + runner_timeout = timeout_s if is_ae_agent else min(timeout_s, 1200.0) + + container_id_early = None + try: + container_id_res = await runtime.run_in_session( + BashAction( + command='cat /etc/hostname 2>/dev/null || hostname 2>/dev/null || echo "unknown"', + timeout=10.0, + ) + ) + container_id_early = str(getattr(container_id_res, "output", "")).strip() + if container_id_early == "unknown": + container_id_early = None + except Exception as e: + print(f"WARNING: Failed to get container id early (will retry after runner): {e}") + + try: + if is_ae_agent: + await runtime.run_in_session(BashAction(command='rm -f /agent/runner.live.log && touch /agent/runner.live.log', timeout=10.0)) + + start_cmd = ( + 'stdbuf -oL -eL /agent/runner.sh "' + model + '" /agent/current_task.txt > /agent/runner.live.log 2>&1 & ' + 'RUNNER_PID=$!; ' + 'sleep 1; ' + 'echo RUNNER_PID=$RUNNER_PID' + ) + start_res = await runtime.run_in_session(BashAction(command=start_cmd, timeout=30.0)) + start_output = str(getattr(start_res, "output", "")).strip() + + pid = None + 
for line in start_output.split('\n'): + if 'RUNNER_PID=' in line: + pid = line.split('RUNNER_PID=', 1)[1].strip() + break + + if not pid or not pid.isdigit(): + await asyncio.sleep(2) + ps_res = await runtime.run_in_session( + BashAction(command="ps aux | grep '[r]unner.py' | awk '{print $2}' | head -1", timeout=10.0) + ) + pid = str(getattr(ps_res, "output", "")).strip() + + print(f'ae-agent runner started with pid: {pid}') + await asyncio.sleep(2) + + elapsed = 0.0 + poll_interval = 10.0 + run_results = None + last_log_content = "" + + while elapsed < runner_timeout: + try: + log_res = await runtime.run_in_session( + BashAction(command='cat /agent/runner.live.log 2>/dev/null || echo ""', timeout=30.0) + ) + current_log_content = str(getattr(log_res, "output", "")).strip() + + if current_log_content and current_log_content != last_log_content: + if last_log_content and current_log_content.startswith(last_log_content): + new_content = current_log_content[len(last_log_content):].strip() + if new_content: + print(f'[ae-agent live log @ {elapsed:.0f}s ({elapsed/60:.1f} min)]\n{new_content}') + else: + print(f'[ae-agent live log @ {elapsed:.0f}s ({elapsed/60:.1f} min)]\n{current_log_content}') + last_log_content = current_log_content + elif elapsed % 300 == 0 and elapsed > 0: + print(f'[ae-agent still running @ {elapsed:.0f}s ({elapsed/60:.1f} min), no new output]') + except Exception as e: + print(f'Failed to read ae-agent live log: {e}') + + if pid and pid.isdigit(): + ps_res = await runtime.run_in_session( + BashAction(command=f'ps -p {pid} >/dev/null 2>&1; echo $?', timeout=10.0) + ) + ps_code = str(getattr(ps_res, "output", "")).strip() + if ps_code != "0": + wait_res = await runtime.run_in_session( + BashAction(command=f'wait {pid} 2>/dev/null; echo $?', timeout=30.0) + ) + exit_code_str = str(getattr(wait_res, "output", "")).strip() + class MockResult: + def __init__(self, code): + self.exit_code = int(code) if code.isdigit() else 0 + self.output = 
f'exit_code={self.exit_code}' + run_results = MockResult(exit_code_str) + print(f'ae-agent runner finished with exit code: {run_results.exit_code}') + break + else: + ps_res = await runtime.run_in_session( + BashAction(command="ps aux | grep '[r]unner.py' | wc -l", timeout=10.0) + ) + proc_count = str(getattr(ps_res, "output", "")).strip() + if proc_count == "0" or not proc_count.isdigit() or int(proc_count) == 0: + print('ae-agent runner process not found, assuming finished') + class MockResult: + def __init__(self): + self.exit_code = 0 + self.output = 'exit_code=0' + run_results = MockResult() + break + + await asyncio.sleep(poll_interval) + elapsed += poll_interval + + if run_results is None: + if pid and pid.isdigit(): + try: + await runtime.run_in_session(BashAction(command=f'kill -TERM {pid} 2>/dev/null || kill -9 {pid} 2>/dev/null || true', timeout=10.0)) + except Exception: + pass + try: + tail_log = await runtime.run_in_session( + BashAction(command='tail -n 200 /agent/runner.live.log', timeout=30.0) + ) + print(f'ae-agent live log tail (on timeout):\n{tail_log}') + except Exception as e: + print(f'Failed to read ae-agent live log after timeout: {e}') + raise TimeoutError(f'ae-agent runner exceeded timeout {runner_timeout}s') + + else: + runner_cmd = '/agent/runner.sh "' + model + '" /agent/current_task.txt' + run_results = await runtime.run_in_session(BashAction(command=runner_cmd, timeout=runner_timeout)) + + print(f"agent's run results: {run_results}") + print('Runner script finished.') + + result = { + 'task_id': task_id, + 'task': task, + 'project_path': project_path, + 'agent_run_results': run_results.output if hasattr(run_results, 'output') else str(run_results), + 'status': 'success' if (hasattr(run_results, 'exit_code') and run_results.exit_code == 0) else 'error', + 'run_on_host': False, + } + + container_id = container_id_early + if not container_id or container_id == "unknown": + try: + container_id_res = await runtime.run_in_session( + 
BashAction( + command='cat /etc/hostname 2>/dev/null || hostname 2>/dev/null || echo "unknown"', + timeout=10.0, + ) + ) + container_id = str(getattr(container_id_res, "output", "")).strip() + except Exception as e: + print(f"WARNING: Failed to get container id from inside container: {e}") + + saved_image = None + container_stopped = False + + if interactive and container_id and container_id != "unknown": + print("\n" + "=" * 60) + print("Interactive mode - Attaching to container. Type instructions (or 'quit'/'exit' to end).") + print("=" * 60 + "\n") + try: + proc = subprocess.run( + ["docker", "exec", "-it", container_id, "python3", "/agent/interactive_runner.py", model], + stdin=sys.stdin, + stdout=sys.stdout, + stderr=sys.stderr, + ) + if proc.returncode != 0: + print(f"Interactive session exited with code {proc.returncode}", file=sys.stderr) + except Exception as e: + print(f"WARNING: Interactive mode failed: {e}", file=sys.stderr) + + if container_id and container_id != "unknown": + print(f"Preparing to save Docker container {container_id} as an image and stop it...") + saved_image, container_stopped = _save_container_as_image(container_id, project_path, task_id) + + try: + await deployment.stop() + except Exception as e: + print(f"WARNING: Failed to stop deployment cleanly: {e}") + + result['container_id'] = container_id + result['saved_image'] = saved_image + result['container_stopped'] = container_stopped + + return result + + except Exception as e: + print(f"Task ended with error: {e}") + result = { + 'task_id': task_id, + 'task': task, + 'project_path': project_path, + 'agent_run_results': str(e), + 'status': 'error', + 'run_on_host': False, + 'container_id': None, + 'saved_image': None, + 'container_stopped': False, + } + if container_id_early and container_id_early != "unknown": + print("Attempting to save container as image (abnormal exit path)...") + try: + saved_img, stopped = _save_container_as_image(container_id_early, project_path, task_id) + 
result['container_id'] = container_id_early + result['saved_image'] = saved_img + result['container_stopped'] = stopped + except Exception as save_e: + print(f"WARNING: Failed to save image on abnormal exit: {save_e}") + try: + await deployment.stop() + except Exception as stop_e: + print(f"WARNING: Failed to stop deployment cleanly: {stop_e}") + return result + + +def run_eval( + env: str, + project_path: str, + task_id: str, + task: str, + model: str, + agent_path: str, + save_path: str, + docker_image: str | None = None, + timeout_ms: int | None = None, + *, + skip_prereq_check: bool = False, + use_gpu: bool = False, + task_file_path: str | None = None, + interactive: bool = False, +): + """Run task in the given environment: local (host) or docker. + + Single entry point for one-task execution. Call this from main. + + Args: + env: 'local' = run on host; otherwise run in Docker (value can be image name). + project_path: Path to the artifact project + task_id: Task identifier + task: Task description (used when task_file_path is None) + model: Model name + agent_path: Path to agent scripts + save_path: Path to save results + docker_image: Docker image (used when env != 'local'); default if None. 
+ timeout_ms: Optional total timeout in milliseconds for this task + skip_prereq_check: If True (host only), skip docker/API-key check before running + use_gpu: If True (Docker only), pass host GPU into container via --gpus all + task_file_path: If set, upload this file as task (avoids passing large string) + interactive: If True, after task completes user can continue giving agent instructions + """ + if timeout_ms is None: + timeout_ms = DEFAULT_TIMEOUT_MS + os.environ["BASH_MAX_TIMEOUT_MS"] = str(timeout_ms) + os.environ["BASH_DEFAULT_TIMEOUT_MS"] = str(timeout_ms) + + if str(env).strip().lower() == "local": + print(f"Task {task_id} configured to run on HOST (env=local, timeout_ms={timeout_ms}, interactive={interactive})") + return asyncio.run( + _run_local( + project_path, task_id, task, model, agent_path, save_path, timeout_ms, + skip_prereq_check=skip_prereq_check, interactive=interactive, + ) + ) + + if not SWEREX_AVAILABLE: + raise RuntimeError( + "SWE-ReX (swerex) is not available. Install swe-rex for Docker mode." 
+ ) + image = docker_image or 'bastoica/ae-agent-ubuntu24.04:latest' + docker_args = [ + '--privileged', + '--cgroupns=host', + '-e', 'KIND_EXPERIMENTAL_CONTAINERD_SNAPSHOTTER=native', + ] + if use_gpu: + docker_args.extend(['--gpus', 'all']) + config = DockerDeploymentConfig( + image=image, + startup_timeout=1200.0, + docker_args=docker_args, + ) + deployment_obj = config.get_deployment() + gpu_note = " (GPU enabled)" if use_gpu else "" + interactive_note = " (interactive)" if interactive else "" + print(f"Task {task_id} configured to run in DOCKER (image={image}, timeout_ms={timeout_ms}){gpu_note}{interactive_note}") + return asyncio.run( + run_eval_in_env( + deployment_obj, project_path, task_id, task, model, agent_path, save_path, + task_file_path=task_file_path, interactive=interactive, + ) + ) diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/utils.py b/benchmarks/arteval_bench/src/agents/ae_agent/utils.py index fda8c0a6..b419804e 100644 --- a/benchmarks/arteval_bench/src/agents/ae_agent/utils.py +++ b/benchmarks/arteval_bench/src/agents/ae_agent/utils.py @@ -1,4 +1,235 @@ -"""Helper for AE Agent runner (timeout constant used by runner.py).""" +"""Helper for AE Agent runner and host/Docker orchestration (main.py, run_eval.py).""" -# Default total timeout in milliseconds (48h); used by runner when BASH_MAX_TIMEOUT_MS is unset. +import json +import os +import subprocess + +# Default total timeout in milliseconds (48h); used by runner.py and run_eval. 
DEFAULT_TIMEOUT_MS = 172_800_000 + + +def interactive_from_item(item: dict) -> bool: + """Whether to enable interactive mode (user can continue giving agent instructions after task completes).""" + v = item.get("interactive", False) + if isinstance(v, bool): + return v + if isinstance(v, str): + return v.strip().lower() in ("true", "1", "yes") + return bool(v) + + +def safe_task_id(task_id: str | None, fallback: str = "unknown") -> str: + """Normalize task_id for use in filenames (no spaces, lowercase).""" + return (task_id or fallback).replace(" ", "_").lower() + + +def timeout_ms_from_item(item: dict) -> int | None: + """Parse timeout from task item. Returns ms (int) or None for default.""" + v = item.get("timeout", None) + if v is None: + return None + if isinstance(v, (int, float)): + return int(v * 1000) if v < 1_000_000 else int(v) + return None + + +def env_from_item(item: dict) -> str: + """Resolve env from task item: 'local' = host, else = docker. Backward compat: run_on_host/docker_env.""" + env = item.get("env", None) + if env is not None: + s = str(env).strip().lower() + return "local" if s == "local" else (str(env).strip() or "docker") + return "local" if item.get("run_on_host", False) else "docker" + + +def gpu_from_item(item: dict) -> bool: + """Whether to enable GPU access in Docker. Default False (no host GPU passed to container).""" + v = item.get("gpu", False) + if isinstance(v, bool): + return v + if isinstance(v, str): + return v.strip().lower() in ("true", "1", "yes") + return bool(v) + + +def docker_image_from_item( + item: dict, + default: str = "bastoica/ae-agent-ubuntu24.04:latest", +) -> str | None: + """Resolve Docker image from task item. 
Returns None when env is local.""" + if env_from_item(item) == "local": + return None + env = item.get("env", None) + if env is not None: + s = str(env).strip() + if s and s.lower() != "local": + return s + return item.get("docker_env", None) or item.get("docer_env", None) or default + + +def get_task(file_path: str) -> str: + """Get agent task from a file path. + + Args: + file_path: Path to README or task description file (relative to artifact root) + + Returns: + Task description string for the agent + """ + task = ( + f"You are an experienced software engineer." + + f" You are asked to navigate to the {file_path} and follow step-by-step" + + f" instructions to set up, install, compile, and reproduce the results in" + + f" that code repository. You have root access inside a Docker image, which" + + f" means you can directly proceed with executing the steps in the README" + + f" without asking for approval or confirmation. Once you reached the end" + + f" of the README you must exit the Docker image gracefully." + ) + return task + + +def read_task_from_file(artifact_path: str, task_file: str) -> str: + """Read task description from a file. 
+ + Args: + artifact_path: Path to artifact root directory + task_file: Relative path to task file (e.g., README.md) + + Returns: + Content of the task file as string + """ + task_file_path = os.path.join(artifact_path, task_file) + if os.path.exists(task_file_path): + with open(task_file_path, 'r', encoding='utf-8') as f: + return f.read() + else: + return get_task(task_file) + + +def clone_artifact_repo(artifact_url: str, target_dir: str) -> str: + """Clone artifact repository from URL into target_dir.""" + if os.path.exists(target_dir) and os.listdir(target_dir): + return target_dir + if os.path.exists(target_dir): + os.rmdir(target_dir) + r = subprocess.run( + ["git", "clone", "--depth", "1", artifact_url, target_dir], + capture_output=True, + text=True, + timeout=600, + ) + if r.returncode != 0: + raise RuntimeError(f"git clone failed: {r.stderr or r.stdout}") + return target_dir + + +def resolve_project_path(item: dict, input_file: str, save_path: str) -> tuple[str | None, str | None]: + """Resolve artifact project path from task item. + + Returns: + (project_path, error_message). If error_message is not None, skip task. 
+ """ + input_dir = os.path.dirname(os.path.abspath(input_file)) + artifact_dir = item.get("artifact_dir") + artifact_url = item.get("artifact_url") + task_id = item.get("artifact_id") + sid = safe_task_id(task_id) + + if artifact_url: + candidate = os.path.join(input_dir, artifact_dir) if artifact_dir else None + if candidate and os.path.isdir(candidate): + return os.path.abspath(candidate), None + workspace_dir = os.path.join(save_path, "workspace", sid) + os.makedirs(os.path.dirname(workspace_dir), exist_ok=True) + return clone_artifact_repo(artifact_url, workspace_dir), None + if not artifact_dir: + return None, f"Skipping task {task_id}: missing artifact_dir and artifact_url" + path = os.path.abspath(os.path.join(input_dir, artifact_dir)) + if not os.path.isdir(path): + return None, f"Project path does not exist: {path}" + return path, None + + +class Tee: + """Write to both original stream and a log file.""" + + def __init__(self, stream, log_path: str): + self._stream = stream + self._path = log_path + self._file = None + + def __enter__(self): + self._file = open(self._path, "a", encoding="utf-8") + return self + + def __exit__(self, *args): + if self._file: + self._file.close() + + def write(self, data): + self._stream.write(data) + if self._file: + self._file.write(data) + self._file.flush() + + def flush(self): + self._stream.flush() + if self._file: + self._file.flush() + + +def write_task_report( + save_path: str, + safe_id: str, + task_id: str, + result: dict, + log_path: str, + agent_summary: str, +) -> None: + """Write ae_report_.md for a single task.""" + report_path = os.path.join(save_path, f"ae_report_{safe_id}.md") + saved_image = result.get("saved_image") + with open(report_path, "w", encoding="utf-8") as fw: + fw.write(f"# AE Report: {task_id}\n\n") + fw.write(f"- **Status**: {result.get('status', 'unknown')}\n") + fw.write(f"- **Timestamp**: {result.get('timestamp', '')}\n") + fw.write(f"- **Project path**: {result.get('project_path', 
'')}\n") + fw.write(f"- **Run on host**: {result.get('run_on_host', False)}\n") + fw.write(f"- **Log file**: `{log_path}`\n\n") + if saved_image: + fw.write("> [!Note]\n") + fw.write("> ## To check the result\n") + fw.write(">\n") + fw.write("> You can run the following command to manually check the result:\n") + fw.write(">\n") + fw.write("> ```bash\n") + fw.write(f"> docker run -it {saved_image} bash\n") + fw.write("> ```\n") + fw.write(">\n") + fw.write(f"> Image: `{saved_image}`\n\n") + fw.write("## Agent summary\n\n") + fw.write(agent_summary) + fw.write("\n") + + +def compute_and_write_summary(save_path: str) -> tuple[int, int]: + """Read result.jsonl, compute total/success, write summary.json. Returns (total_count, success_count).""" + result_path = os.path.join(save_path, "result.jsonl") + total, success = 0, 0 + if os.path.isfile(result_path): + with open(result_path, encoding="utf-8") as f: + for line in f: + if not line.strip(): + continue + try: + row = json.loads(line.strip()) + total += 1 + if row.get("status") == "success": + success += 1 + except json.JSONDecodeError: + continue + rate = success / total if total > 0 else 0.0 + summary = {"total_tasks": total, "successful_tasks": success, "success_rate": rate} + with open(os.path.join(save_path, "summary.json"), "w", encoding="utf-8") as f: + json.dump(summary, f, indent=4) + return total, success diff --git a/benchmarks/arteval_bench/src/main.py b/benchmarks/arteval_bench/src/main.py index df3e6c49..43c7390a 100644 --- a/benchmarks/arteval_bench/src/main.py +++ b/benchmarks/arteval_bench/src/main.py @@ -30,7 +30,7 @@ def main(file_path, model, agent, save_path): logger.info(f'Skipping invalid JSON line: {line}') continue - deployment = item.get('docker_env', None) + deployment = item.get('docker_env', None) or item.get('docer_env', None) project_path = f"./data/benchmark/{item.get('artifact_dir', None)}" task_file = item.get('artifact_readme', None) task_id = item.get('artifact_id', None) From 
2c3af40bce4987b89a7dc6717414f4f5db953c75 Mon Sep 17 00:00:00 2001 From: Bogdan 'Bo' Stoica Date: Wed, 25 Feb 2026 05:57:06 +0000 Subject: [PATCH 4/7] Integrate ae_agent into ArtEval benchmark with evaluation flow and smoke test - Add ae_agent under benchmarks/arteval_bench/src/agents/ae_agent (main, run_eval, runner, utils, runner.sh, install.sh) - Wire benchmark main.py and run_eval_in_env.py for ae_agent: host path runs agent then evaluator, parses score; Docker path uses same flow - Add src/utils.py re-export for get_task when running from benchmark root - SDK utils: do not overwrite existing env vars when loading env.toml (preserve API key) - Add minimal smoke test: ae_agent_smoke artifact, ae_agent_smoke_test.jsonl (host + docker), run_ae_agent_smoke_test.sh - Remove interactive_runner.py (interactive handled in runner) - Use English throughout (docs, comments); ruff-compliant; single _make_eval_result for result shape --- .../data/benchmark/ae_agent_smoke/README.md | 13 + .../ae_agent_smoke/README_SMOKE_TEST.md | 44 + .../ae_agent_smoke/_agent_eval/check.py | 22 + .../data/benchmark/ae_agent_smoke_test.jsonl | 2 + .../arteval_bench/run_ae_agent_smoke_test.sh | 21 + .../src/agents/ae_agent/README.md | 56 +- .../src/agents/ae_agent/__init__.py | 23 +- .../src/agents/ae_agent/install.sh | 16 +- .../src/agents/ae_agent/interactive_runner.py | 105 -- .../arteval_bench/src/agents/ae_agent/main.py | 324 ++-- .../src/agents/ae_agent/run_eval.py | 1374 ++++++++++------- .../src/agents/ae_agent/runner.py | 570 ++++--- .../src/agents/ae_agent/runner.sh | 16 +- .../src/agents/ae_agent/utils.py | 334 +++- benchmarks/arteval_bench/src/main.py | 12 +- .../arteval_bench/src/run_eval_in_env.py | 91 +- benchmarks/arteval_bench/src/utils.py | 4 + sdk/utils.py | 25 +- 18 files changed, 1926 insertions(+), 1126 deletions(-) create mode 100644 benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README.md create mode 100644 
benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README_SMOKE_TEST.md create mode 100644 benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/_agent_eval/check.py create mode 100644 benchmarks/arteval_bench/data/benchmark/ae_agent_smoke_test.jsonl create mode 100755 benchmarks/arteval_bench/run_ae_agent_smoke_test.sh delete mode 100644 benchmarks/arteval_bench/src/agents/ae_agent/interactive_runner.py create mode 100644 benchmarks/arteval_bench/src/utils.py diff --git a/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README.md b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README.md new file mode 100644 index 00000000..c6b0c758 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README.md @@ -0,0 +1,13 @@ +# AE Agent Smoke Test Artifact + +Minimal task for quick testing of ae_agent (host/docker + evaluation). Should complete in under a minute. + +## Task + +1. In this directory (the artifact root), create a file named **success.txt**. +2. The file must contain exactly the single character **1** (no newline required). +3. No other steps are required. + +Example (bash): `echo -n 1 > success.txt` + +After you finish, the benchmark will run an evaluation script that checks for this file and outputs a score (1 if correct, 0 otherwise). diff --git a/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README_SMOKE_TEST.md b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README_SMOKE_TEST.md new file mode 100644 index 00000000..bab00be4 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/README_SMOKE_TEST.md @@ -0,0 +1,44 @@ +# AE Agent smoke test + +## Purpose + +- Test the agent under `src/agents/ae_agent`: **host** and **docker** modes, and the **evaluation script** flow (evaluator runs after the agent and parses score). +- Task is minimal (create `success.txt` with content `1` in the artifact root); finishes in a few minutes and avoids long runs with full arteval_tasks. 
+
+## Files
+
+- **ae_agent_smoke/**: Minimal artifact
+  - `README.md`: Task description (create success.txt with content 1)
+  - `_agent_eval/check.py`: Evaluator; outputs `1` if success.txt exists and contains `1`, else `0`
+- **ae_agent_smoke_test.jsonl**: Two lines
+  - First line: `run_on_host: true`, run ae_agent + evaluator on host
+  - Second line: `run_on_host: false`, run ae_agent + evaluator in Docker
+
+## How to run
+
+From the **benchmarks/arteval_bench** directory:
+
+```bash
+# Set ANTHROPIC_API_KEY or ANTHROPIC_FOUNDRY_API_KEY first
+python src/main.py \
+  -i ./data/benchmark/ae_agent_smoke_test.jsonl \
+  -a ae_agent \
+  -m claude-sonnet-4-5-20250929 \
+  -o ./outputs/ae_agent_smoke_$(date +%Y%m%d_%H%M%S)
+```
+
+- **Host task**: Runs the agent on the host, then runs `python3 _agent_eval/check.py` on the host to get the score.
+- **Docker task**: Runs the agent in the container, then runs the evaluator in the container to get the score; the container is kept running by default for debugging.
+
+Results are under the `-o` directory: `result.jsonl` (one JSON object per line with `score`, `status`, `test_method`, etc.) and `avg_score.json`.
+
+## Interactive mode
+
+The benchmark’s `src/main.py` does not read an `interactive` field from the JSONL, so the command above only covers **non-interactive** runs. To test interactive mode:
+
+- Use ae_agent’s main entry with `--interactive`, and set `"env": "local"` or `"run_on_host": true` / `"env": "docker"` in the JSONL for the task, for example:
+  ```bash
+  cd src/agents
+  python -m ae_agent.main --interactive -i ../../data/benchmark/ae_agent_smoke_test.jsonl -o ../../outputs/ae_agent_smoke_int
+  ```
+- In interactive mode, after the first task completes you can keep typing instructions; type `quit` or `exit` to end.
diff --git a/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/_agent_eval/check.py b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/_agent_eval/check.py new file mode 100644 index 00000000..e0d7c479 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke/_agent_eval/check.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +"""Minimal evaluator for ae_agent_smoke: output 1 if success.txt exists and contains '1', else 0. + +Output must be a single digit on a line (or last line) for benchmark score parsing. +""" +import os +import sys + +def main(): + root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + path = os.path.join(root, "success.txt") + if os.path.isfile(path): + with open(path, "r") as f: + content = f.read().strip() + if content == "1": + print(1) + sys.exit(0) + print(0) + sys.exit(0) + +if __name__ == "__main__": + main() diff --git a/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke_test.jsonl b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke_test.jsonl new file mode 100644 index 00000000..3971f37e --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke_test.jsonl @@ -0,0 +1,2 @@ +{"artifact_id": "ae_agent_smoke_host", "artifact_dir": "ae_agent_smoke", "artifact_readme": "ae_agent_smoke/README.md", "evaluator": "python3 _agent_eval/check.py", "expected_score": 1, "run_on_host": true} +{"artifact_id": "ae_agent_smoke_docker", "artifact_dir": "ae_agent_smoke", "artifact_readme": "ae_agent_smoke/README.md", "docker_env": "bastoica/ae-agent-ubuntu24.04:latest", "evaluator": "python3 _agent_eval/check.py", "expected_score": 1, "run_on_host": false} diff --git a/benchmarks/arteval_bench/run_ae_agent_smoke_test.sh b/benchmarks/arteval_bench/run_ae_agent_smoke_test.sh new file mode 100755 index 00000000..dba27087 --- /dev/null +++ b/benchmarks/arteval_bench/run_ae_agent_smoke_test.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Run ae_agent smoke test under arteval_bench (host + docker, with 
evaluation). +# Usage: ./run_ae_agent_smoke_test.sh [model_name] +# Default model: claude-sonnet-4-5-20250929 + +set -e +BENCH_ROOT="$(cd "$(dirname "$0")" && pwd)" +cd "$BENCH_ROOT" +MODEL="${1:-claude-sonnet-4-5-20250929}" +OUT_DIR="./outputs/ae_agent_smoke_$(date +%Y%m%d_%H%M%S)" +echo "==> AE Agent smoke test (host + docker + evaluation)" +echo " Model: $MODEL" +echo " Output: $OUT_DIR" +echo "" +python src/main.py \ + -i ./data/benchmark/ae_agent_smoke_test.jsonl \ + -a ae_agent \ + -m "$MODEL" \ + -o "$OUT_DIR" +echo "" +echo "==> Done. Results: $OUT_DIR/result.jsonl and $OUT_DIR/avg_score.json" diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/README.md b/benchmarks/arteval_bench/src/agents/ae_agent/README.md index 7ebb5acf..bf0e3aa9 100644 --- a/benchmarks/arteval_bench/src/agents/ae_agent/README.md +++ b/benchmarks/arteval_bench/src/agents/ae_agent/README.md @@ -1,16 +1,15 @@ # AE Agent (ArtEval sub-agent) -This agent is the **ae-agent** logic integrated as a sub-agent of the system-intelligence-benchmark ArtEval benchmark. It uses the Claude Agent SDK to run artifact evaluation tasks inside the benchmark container. Code is synced from the standalone [ae-agent](https://github.com/Couen/ae-agent) repo. +This agent is the **ae_agent** for the system-intelligence-benchmark ArtEval benchmark, with the same logic as the standalone [ae-agent](https://github.com/Couen/ae-agent) repo. It runs inside the benchmark container using the Claude Agent SDK to execute artifact evaluation tasks. ## Files -- **install.sh**: Installs `claude-agent-sdk==0.1.24` and configures `~/.claude/settings.json` (48h Bash timeout). -- **runner.sh**: Entry point invoked as `runner.sh `. Forwards to `runner.py`. Uses `/agent/current_task.txt` when the benchmark passes task via file. -- **runner.py**: Runs the task with Claude Agent SDK; supports rate-limit retry (429), message_formatter; second argument can be task text or path to file. 
-- **run_eval.py**: Orchestration for one task: `env='local'` runs on host, otherwise runs in Docker (requires swerex/swe-rex). -- **main.py**: CLI entry for batch runs from JSONL; supports both host and Docker per task (see “Run on host (local)” below). -- **utils.py**: `DEFAULT_TIMEOUT_MS`, task/path helpers, Tee, reports, summary (used by runner, main, run_eval). -- **interactive_runner.py**: Interactive multi-turn session inside container (e.g. `docker exec -it python3 /agent/interactive_runner.py `). +- **install.sh**: Installs `claude-agent-sdk` inside the container for use by runner.py. +- **runner.sh**: Entry script; invoked as `runner.sh `. Uses `/agent/current_task.txt` when the benchmark passes the task via file. +- **runner.py**: Runs the task with Claude Agent SDK; supports 429 rate-limit retry; second argument can be task text or path to a task file. Artifact path in container is `/repo`. +- **run_eval.py**: Single-task orchestration: `env='local'` runs on host, otherwise runs in Docker (requires swerex/swe-rex). +- **main.py**: CLI entry for batch runs from JSONL; supports host or Docker per task. +- **utils.py**: Timeout, task/path helpers, Tee, reports, summary (used by runner, main, run_eval). - **__init__.py**: Package marker. ## Usage from the benchmark @@ -21,44 +20,43 @@ From the benchmark root (`benchmarks/arteval_bench/`): python src/main.py -i ./data/benchmark/arteval_tasks.jsonl -a ae_agent -m claude-sonnet-4-5-20250929 -o ./outputs/ae_agent_run ``` -Or use the helper script from `data/benchmark/`: - -```bash -./data/benchmark/run_ae_agent.sh [model_name] -``` +You can also use `-a ae-agent`; it is equivalent to `ae_agent`. The benchmark will: -1. Upload the agent to `/agent` in the container. -2. For ae_agent: upload task to `/agent/current_task.txt`, then run `runner.sh "$model" /agent/current_task.txt` (avoids shell quoting with large tasks). -3. 
Use long-running and live-log behavior (48h timeout, live log streaming, `_agent_eval` removal before run and re-upload before evaluation, container kept for debugging). -4. Pass through `ANTHROPIC_API_KEY`, `ANTHROPIC_FOUNDRY_API_KEY`, `ANTHROPIC_FOUNDRY_BASE_URL`, `CLAUDE_CODE_USE_FOUNDRY` when set. +1. Upload this agent to `/agent` in the container. +2. For ae_agent: write the task to `/agent/current_task.txt`, then run `runner.sh "$model" /agent/current_task.txt` (avoids shell quoting issues with large tasks). +3. Use long-running and live-log behavior (48h timeout, streamed logs, remove `_agent_eval` before run and re-upload before evaluation, container kept for debugging). +4. **Evaluation script flow** (same as claude_sdk): after the agent finishes, run the JSONL `evaluator` (test_method), e.g. `cd /repo && python _agent_eval/main.py`, parse output for `score` and write to result. +5. If set, pass through `ANTHROPIC_API_KEY`, `ANTHROPIC_FOUNDRY_API_KEY`, `ANTHROPIC_FOUNDRY_BASE_URL`, `CLAUDE_CODE_USE_FOUNDRY`. + +**Evaluation flow on host**: When `run_on_host=True` and the agent is ae_agent, `run_eval_in_env.run_eval_on_host` calls this package’s `run_agent_then_eval()`: run the agent first, then run `test_method` on the host (e.g. `cd project_path && python _agent_eval/main.py`), parse score with `utils.parse_eval_score()`, and return a result with the same shape as the Docker path (`score`, `test_method`, `status`). ## Dependencies -- Python 3 with `claude-agent-sdk` (installed by `install.sh`). -- Optional: `message_formatter` for prettier output (if present in the environment). +- Python 3; `claude-agent-sdk` is installed in the container via `install.sh`. +- When running in Docker via the benchmark’s `run_eval_in_env.py`, install `swerex` on the host (the benchmark includes it). When using this directory’s `main.py` for Docker mode standalone, you also need `swe-rex`. 
-## Run on host (local) +## Running on host (local) -You can run tasks **on the host machine** (no Docker) from this directory: +You can run tasks on the **host** from this directory (without the benchmark’s Docker flow): -1. **Single-task / batch via main.py** - Use a JSONL input where each line can set `"env": "local"` or `"run_on_host": true` to run that task on the host. Other lines without that run in Docker (if swerex is available). +1. **Single or batch via main.py** + Use a JSONL where each line can set `"env": "local"` or `"run_on_host": true` to run that task on the host; others run in Docker (requires swerex). ```bash cd benchmarks/arteval_bench/src/agents/ae_agent - python main.py -i /path/to/tasks.jsonl -a ae_agent -m claude-sonnet-4-5-20250929 -o ./outputs/host_run + python -m ae_agent.main -i /path/to/tasks.jsonl -a ae_agent -m claude-sonnet-4-5-20250929 -o ./outputs/host_run ``` -2. **Requirements for host mode** - - `ANTHROPIC_API_KEY` or `ANTHROPIC_FOUNDRY_API_KEY` set +2. **Host mode requirements** + - Set `ANTHROPIC_API_KEY` or `ANTHROPIC_FOUNDRY_API_KEY` - Docker installed and running (for prereq check; agent runs on host) - `pip install claude-agent-sdk` 3. **Docker mode from this directory** - If JSONL has `"env": "docker"` (or no `run_on_host`), `main.py` will run that task in Docker via `run_eval.py` (requires `swe-rex` / `swerex`). + If the JSONL has `"env": "docker"` (or `run_on_host` is not set), `main.py` runs that task in Docker via `run_eval.py` (requires `swe-rex`/`swerex`). -## Relation to standalone ae-agent repo +## Relation to the standalone ae-agent repo -The standalone ae-agent repo provides the same host/Docker CLI. This sub-agent includes both the **in-container** runner (used by the benchmark’s `run_eval_in_env.py`) and the **host/local** mode via `main.py` and `run_eval.py`. +The standalone ae-agent repo provides the same host/Docker CLI. 
This sub-agent includes both the **in-container** runner (used by the benchmark’s `run_eval_in_env.py`) and **host/local** mode via `main.py` and `run_eval.py`. diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/__init__.py b/benchmarks/arteval_bench/src/agents/ae_agent/__init__.py index e5bc1f34..ca489f55 100644 --- a/benchmarks/arteval_bench/src/agents/ae_agent/__init__.py +++ b/benchmarks/arteval_bench/src/agents/ae_agent/__init__.py @@ -1,4 +1,23 @@ -"""AE Agent for ArtEvalBench - Claude Agent SDK runner for artifact evaluation tasks. +"""AE Agent - A tool for running Claude Agent SDK on artifact evaluation tasks. -Contract: artifact at /repo, this agent at /agent; task passed as CLI arg or path to file (/agent/current_task.txt). +Output files (under save_path): +- ae_report_.md: Per-artifact report with status and agent summary +- ae_log_.log: Per-artifact execution log +- result.jsonl: Per-task results (one JSON per line) +- summary.json: Overall statistics """ + +from .main import cli_main, main +from .run_eval import run_agent_then_eval, run_eval +from .runner import build_system_prompt, run_agent +from .utils import parse_eval_score + +__all__ = [ + 'build_system_prompt', + 'cli_main', + 'main', + 'parse_eval_score', + 'run_agent', + 'run_agent_then_eval', + 'run_eval', +] diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/install.sh b/benchmarks/arteval_bench/src/agents/ae_agent/install.sh index 8a498c3a..829de33d 100644 --- a/benchmarks/arteval_bench/src/agents/ae_agent/install.sh +++ b/benchmarks/arteval_bench/src/agents/ae_agent/install.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Setup AE Agent environment inside benchmark container. -# Ensures claude-agent-sdk is available so runner.py can run. +# Setup agent running environment inside Docker container. +# Ensures claude-agent-sdk is available so runner.py can import claude_agent_sdk. set -e if ! python3 -c "import claude_agent_sdk" 2>/dev/null; then echo "Installing claude-agent-sdk..." 
@@ -9,14 +9,4 @@ if ! python3 -c "import claude_agent_sdk" 2>/dev/null; then echo "WARNING: claude_agent_sdk still not importable; runner may fail." fi fi -# 48h Bash timeout for long-running artifact tasks -mkdir -p ~/.claude -cat > ~/.claude/settings.json << 'EOF' -{ - "env": { - "BASH_MAX_TIMEOUT_MS": "172800000", - "BASH_DEFAULT_TIMEOUT_MS": "172800000" - } -} -EOF -echo "AE Agent environment ready (~/.claude/settings.json configured)." +echo "Agent environment ready." diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/interactive_runner.py b/benchmarks/arteval_bench/src/agents/ae_agent/interactive_runner.py deleted file mode 100644 index 93e3e2cd..00000000 --- a/benchmarks/arteval_bench/src/agents/ae_agent/interactive_runner.py +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env python3 -"""Interactive runner for AE Agent - runs inside container after main task. - -Used when interactive=True: docker exec -it python3 /agent/interactive_runner.py -Artifact at /repo; API keys from container env. -""" - -import asyncio -import os -import sys - -sys.path.insert(0, '/agent') - -try: - from utils import DEFAULT_TIMEOUT_MS -except ImportError: - DEFAULT_TIMEOUT_MS = 172_800_000 - -try: - from claude_agent_sdk import ClaudeAgentOptions, ClaudeSDKClient -except ImportError as e: - print(f"ERROR: claude_agent_sdk not available: {e}", file=sys.stderr) - sys.exit(1) - - -def _build_system_prompt() -> str: - try: - timeout_ms_env = os.environ.get("BASH_MAX_TIMEOUT_MS") - timeout_ms = int(timeout_ms_env) if timeout_ms_env else DEFAULT_TIMEOUT_MS - except ValueError: - timeout_ms = DEFAULT_TIMEOUT_MS - - return """You are an experienced software engineer in an interactive session. - -ENVIRONMENT: -- You are inside a Docker container with root permissions. -- The artifact repository is at /repo. Change to it: cd /repo -- You have access to Read, Write, and Bash tools. - -TIMEOUT: Long-running commands can take hours; do not set short timeouts. 
- -You will receive follow-up instructions from the user. Complete each one and respond. -If the user asks to stop or says 'quit'/'exit', acknowledge and they will end the session.""" - - -def _display_message(msg) -> None: - if hasattr(msg, 'content'): - for block in msg.content: - if hasattr(block, 'text'): - print(block.text, end='', flush=True) - print(flush=True) - - -async def _interactive_loop(model_name: str) -> int: - options = ClaudeAgentOptions( - system_prompt=_build_system_prompt(), - allowed_tools=["Read", "Write", "Bash"], - setting_sources=["user"], - ) - - print("\n" + "=" * 60, flush=True) - print("Interactive mode - Agent ready. Type your instructions (or 'quit'/'exit' to end).", flush=True) - print("=" * 60 + "\n", flush=True) - - async with ClaudeSDKClient(options=options) as client: - await client.query( - "Please confirm you are in /repo and ready for the user's follow-up instructions. Reply briefly that you are ready." - ) - async for msg in client.receive_response(): - _display_message(msg) - - while True: - try: - user_input = input("\n>>> ").strip() - except (EOFError, KeyboardInterrupt): - print("\nExiting interactive mode.", flush=True) - return 0 - - if not user_input: - continue - if user_input.lower() in ('quit', 'exit', 'q'): - print("Exiting interactive mode.", flush=True) - return 0 - - await client.query(user_input) - async for msg in client.receive_response(): - _display_message(msg) - - return 0 - - -def main() -> int: - model_name = os.environ.get("AE_AGENT_MODEL", "claude-sonnet-4-5-20250929") - if len(sys.argv) >= 2: - model_name = sys.argv[1] - - if not os.environ.get('ANTHROPIC_API_KEY') and not os.environ.get('ANTHROPIC_FOUNDRY_API_KEY'): - print("ERROR: ANTHROPIC_API_KEY or ANTHROPIC_FOUNDRY_API_KEY must be set.", file=sys.stderr) - return 1 - - return asyncio.run(_interactive_loop(model_name)) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/main.py 
b/benchmarks/arteval_bench/src/agents/ae_agent/main.py index ae36ae71..d88cdc27 100644 --- a/benchmarks/arteval_bench/src/agents/ae_agent/main.py +++ b/benchmarks/arteval_bench/src/agents/ae_agent/main.py @@ -1,18 +1,28 @@ -"""Main entry point for running artifact tasks (host or Docker). +"""Main entry point for running artifact tasks. Supports both: - Run from this directory: env=local (host) or env=docker per task in JSONL. -- Used as in-container runner when benchmark uploads this agent to /agent. +- Used as in-container runner when benchmark (arteval_bench) uploads this agent to /agent. """ +from __future__ import annotations + import argparse import json +import logging import os import sys +from dataclasses import dataclass from datetime import datetime -from .run_eval import run_eval +from .run_eval import make_error_result, run_eval from .utils import ( + AGENT_SUMMARY_FALLBACK_MAX, + DEFAULT_MODEL, + LOG_OUTPUT_TRUNCATE_BYTES, + SUMMARY_BASENAME_TEMPLATE, + SUMMARY_INSTRUCTION, + Tee, compute_and_write_summary, docker_image_from_item, env_from_item, @@ -22,135 +32,249 @@ read_task_from_file, resolve_project_path, safe_task_id, - Tee, timeout_ms_from_item, write_task_report, ) +def _build_task_with_summary(task: str, safe_id: str) -> tuple[str, str]: + """Append summary instruction to task. 
Returns (task, summary_basename).""" + summary_basename = SUMMARY_BASENAME_TEMPLATE.format(safe_id=safe_id) + full_task = task.rstrip() + SUMMARY_INSTRUCTION.format(basename=summary_basename) + return full_task, summary_basename + + +def _persist_result(save_path: str, result: dict, log_path: str) -> None: + """Write result to result.jsonl and append run output to log.""" + with open(f'{save_path}/result.jsonl', 'a+', encoding='utf-8') as fw: + fw.write(json.dumps(result, ensure_ascii=False) + '\n') + with open(log_path, 'a', encoding='utf-8') as lf: + lf.write(f'\nTask finished at {result["timestamp"]}, status: {result.get("status", "unknown")}\n') + lf.write('\n--- Agent run output ---\n') + run_out = str(result.get('agent_run_results', '')) + lf.write(run_out[:LOG_OUTPUT_TRUNCATE_BYTES]) + if len(run_out) > LOG_OUTPUT_TRUNCATE_BYTES: + lf.write('\n... (truncated)\n') + + +def _gather_agent_summary(project_path: str, summary_basename: str, result: dict) -> str: + """Read agent summary file or fallback to truncated run output.""" + summary_file = os.path.join(project_path, summary_basename) + if os.path.isfile(summary_file): + try: + with open(summary_file, encoding='utf-8') as f: + return f.read() + except OSError as e: + logging.warning('Failed to read summary file %s: %s', summary_file, e) + fallback = str(result.get('agent_run_results', ''))[:AGENT_SUMMARY_FALLBACK_MAX] + return fallback or '(No summary captured)' + + +def _persist_skipped(save_path: str, task_id: str, message: str) -> None: + """Append one result line for a skipped task so summary total is accurate.""" + result = { + 'task_id': task_id, + 'status': 'skipped', + 'message': message, + 'timestamp': datetime.now().isoformat(), + } + with open(f'{save_path}/result.jsonl', 'a+', encoding='utf-8') as fw: + fw.write(json.dumps(result, ensure_ascii=False) + '\n') + + +def _run_single_task( + item: dict, + model: str, + agent: str, + save_path: str, + input_file: str, + interactive_default: bool, +) -> 
None: + """Process a single JSONL task: parse, run, write results and report.""" + env = env_from_item(item) + docker_image = docker_image_from_item(item, env=env) + use_gpu = gpu_from_item(item) + interactive = interactive_from_item(item) or interactive_default + task_file = item.get('artifact_readme', None) + task_id = item.get('artifact_id', None) + timeout_ms = timeout_ms_from_item(item) + safe_id = safe_task_id(task_id) + + project_path, path_error = resolve_project_path(item, input_file, save_path) + if path_error: + print(path_error) + _persist_skipped(save_path, task_id or safe_id, path_error) + return + print(f'Project path: {project_path}') + + raw_task = read_task_from_file(project_path, task_file) if task_file else get_task('README.md') + task, summary_basename = _build_task_with_summary(raw_task, safe_id) + + task_file_path = os.path.join(save_path, f'current_task_{safe_id}.txt') + with open(task_file_path, 'w', encoding='utf-8') as f: + f.write(task) + + timeout_str = str(timeout_ms) if timeout_ms is not None else 'default' + print(f'Task {task_id}: env={env}, timeout_ms={timeout_str}, gpu={use_gpu}, interactive={interactive}') + + log_path = os.path.join(save_path, f'ae_log_{safe_id}.log') + with open(log_path, 'w', encoding='utf-8') as lf: + lf.write(f'Task {task_id} started at {datetime.now().isoformat()}\n') + lf.write(f'Project path: {project_path}\n') + lf.write(f'Env: {env}\n\n') + + # Run task (stdout/stderr teed to log), then persist result and report. + # Note: For env='local', agent_path is ignored; the in-process runner (this package) is used. 
+ old_stdout, old_stderr = sys.stdout, sys.stderr + try: + with Tee(sys.stdout, log_path) as tee_out: + with Tee(sys.stderr, log_path) as tee_err: + sys.stdout, sys.stderr = tee_out, tee_err + result = run_eval( + env=env, + docker_image=docker_image, + project_path=project_path, + task_id=task_id, + task=task, + task_file_path=task_file_path, + model=model, + agent_path=agent, + save_path=save_path, + timeout_ms=timeout_ms, + use_gpu=use_gpu, + interactive=interactive, + ) + except Exception as e: + sys.stdout, sys.stderr = old_stdout, old_stderr + logging.exception('run_eval failed for task %s: %s', task_id, e) + result = make_error_result(task_id, task, project_path, str(e), env) + finally: + sys.stdout, sys.stderr = old_stdout, old_stderr + + result['timestamp'] = datetime.now().isoformat() + result['log_file'] = log_path + _persist_result(save_path, result, log_path) + + agent_summary = _gather_agent_summary(project_path, summary_basename, result) + write_task_report(save_path, safe_id, task_id, result, log_path, agent_summary) + print(f'Task {task_id} completed. 
Status: {result.get("status", "unknown")}') + + def main(input_file, model, agent, save_path, interactive_default: bool = False): """Main function for running tasks.""" + if not os.path.isfile(input_file): + logging.error('Input file not found: %s', input_file) + sys.exit(1) + print(f'Using model: {model}, agent: {agent}') - with open(input_file) as f: - for line in f: + with open(input_file, encoding='utf-8') as f: + for line_no, line in enumerate(f, start=1): if not line.strip(): continue try: item = json.loads(line) - except json.JSONDecodeError: - print(f'Skipping invalid JSON line: {line}') + except json.JSONDecodeError as e: + print(f'Skipping invalid JSON at line {line_no}: {e}') + _persist_skipped(save_path, f'line_{line_no}', f'Invalid JSON: {e}') continue - env = env_from_item(item) - docker_image = docker_image_from_item(item) - use_gpu = gpu_from_item(item) - interactive = interactive_from_item(item) or interactive_default - task_file = item.get("artifact_readme", None) - task_id = item.get("artifact_id", None) - timeout_ms = timeout_ms_from_item(item) - safe_id = safe_task_id(task_id) - - project_path, path_error = resolve_project_path(item, input_file, save_path) - if path_error: - print(path_error) - continue - print(f"Project path: {project_path}") + _run_single_task( + item=item, + model=model, + agent=agent, + save_path=save_path, + input_file=input_file, + interactive_default=interactive_default, + ) - task = read_task_from_file(project_path, task_file) if task_file else get_task("README.md") - summary_basename = f'ae_summary_{safe_id}.md' - task = task.rstrip() + f"\n\nAt the end, write a brief summary of what you did and the result to {summary_basename} in the artifact root (so it can be included in the report)." 
- - task_file_path = os.path.join(save_path, f'current_task_{safe_id}.txt') - with open(task_file_path, 'w', encoding='utf-8') as f: - f.write(task) + total_count, success_count = compute_and_write_summary(save_path) + print(f'All tasks completed: {success_count}/{total_count} succeeded.') - print(f"Task {task_id}: env={env}, timeout_ms={timeout_ms if timeout_ms is not None else 'default'}, gpu={use_gpu}, interactive={interactive}") - log_path = os.path.join(save_path, f'ae_log_{safe_id}.log') - with open(log_path, 'w', encoding='utf-8') as lf: - lf.write(f"Task {task_id} started at {datetime.now().isoformat()}\n") - lf.write(f"Project path: {project_path}\n") - lf.write(f"Env: {env}\n\n") - old_stdout, old_stderr = sys.stdout, sys.stderr - try: - with Tee(sys.stdout, log_path) as tee_out: - with Tee(sys.stderr, log_path) as tee_err: - sys.stdout, sys.stderr = tee_out, tee_err - result = run_eval( - env=env, - docker_image=docker_image, - project_path=project_path, - task_id=task_id, - task=task, - model=model, - agent_path=agent, - save_path=save_path, - timeout_ms=timeout_ms, - use_gpu=use_gpu, - task_file_path=task_file_path, - interactive=interactive, - ) - finally: - sys.stdout, sys.stderr = old_stdout, old_stderr - - result["timestamp"] = datetime.now().isoformat() - result["log_file"] = log_path - with open(f"{save_path}/result.jsonl", "a+", encoding="utf-8") as fw: - fw.write(json.dumps(result, ensure_ascii=False) + "\n") - with open(log_path, "a", encoding="utf-8") as lf: - lf.write(f"\nTask finished at {result['timestamp']}, status: {result.get('status', 'unknown')}\n") - lf.write("\n--- Agent run output ---\n") - run_out = str(result.get("agent_run_results", "")) - lf.write(run_out[:50000]) - if len(run_out) > 50000: - lf.write("\n... 
(truncated)\n") - - summary_file = os.path.join(project_path, summary_basename) - agent_summary = "" - if os.path.isfile(summary_file): - try: - with open(summary_file, "r", encoding="utf-8") as f: - agent_summary = f.read() - except Exception: - pass - if not agent_summary: - agent_summary = (str(result.get("agent_run_results", ""))[:8000] or "(No summary captured)") - write_task_report(save_path, safe_id, task_id, result, log_path, agent_summary) - print(f"Task {task_id} completed. Status: {result.get('status', 'unknown')}") +@dataclass +class _ResolvedConfig: + """Resolved CLI configuration ready for main().""" - total_count, success_count = compute_and_write_summary(save_path) - print(f"All tasks completed: {success_count}/{total_count} succeeded.") + input_file: str + model: str + agent: str + save_path: str + interactive_default: bool -def cli_main(): - """CLI entry point.""" - parser = argparse.ArgumentParser(description='AE Agent - Run Claude Agent SDK on artifact tasks (host or Docker)') - parser.add_argument('-i', '--input_file', help='Input JSONL file with tasks', default='./data/benchmark/arteval_tasks.jsonl') +def _parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser(description='AE Agent - Run Claude Agent SDK on artifact tasks') + parser.add_argument( + '-i', + '--input_file', + help='Input JSONL file with tasks', + default='./data/benchmark/arteval_tasks.jsonl', + ) parser.add_argument('-o', '--save_path', help='Result save path', default=None) - parser.add_argument('-a', '--agent', help='Agent name (default: ae_agent)', default='ae_agent') - parser.add_argument('-m', '--model_name', help='Model Name', default='claude-sonnet-4-5-20250929') - parser.add_argument('--interactive', action='store_true', help='Enable interactive mode (continue giving agent instructions after task completes)') - args = parser.parse_args() + parser.add_argument( + '-a', + '--agent', + help='Agent name (default: 
ae-agent)', + default='ae-agent', + ) + parser.add_argument( + '-m', + '--model_name', + help='Model Name', + default=DEFAULT_MODEL, + ) + parser.add_argument( + '--interactive', + action='store_true', + help='Enable interactive mode (continue giving agent instructions after task completes)', + ) + return parser.parse_args() + + +def _resolve_paths(args: argparse.Namespace) -> _ResolvedConfig: + """Resolve paths and agent from parsed args.""" model_name = args.model_name agent = args.agent input_file = args.input_file save_path = args.save_path + if save_path is None: str_model_name = model_name.replace('/', '_').lower() timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') - save_path = os.path.join('./outputs', f'ae_{str_model_name}_ae_agent_{timestamp}') - # When running from this directory, use it as agent path + save_path = os.path.join('./outputs', f'ae_{str_model_name}_ae-agent_{timestamp}') + + # When running from this directory (standalone or as arteval_bench agent), use script dir as agent path if agent in ('ae-agent', 'ae_agent', 'claude_sdk'): - script_dir = os.path.dirname(os.path.abspath(__file__)) - agent = script_dir + agent = os.path.dirname(os.path.abspath(__file__)) + save_path = os.path.abspath(os.path.expanduser(save_path)) os.makedirs(save_path, exist_ok=True) - interactive_default = getattr(args, 'interactive', False) - print(f"Input file: {input_file}") - print(f"Save path: {save_path}") - print(f"Agent path: {agent}") - main(input_file, model_name, agent, save_path, interactive_default=interactive_default) + + return _ResolvedConfig( + input_file=input_file, + model=model_name, + agent=agent, + save_path=save_path, + interactive_default=getattr(args, 'interactive', False), + ) + + +def cli_main(): + """CLI entry point.""" + args = _parse_args() + config = _resolve_paths(args) + print(f'Input file: {config.input_file}') + print(f'Save path: {config.save_path}') + print(f'Agent path: {config.agent}') + main( + config.input_file, + 
config.model, + config.agent, + config.save_path, + interactive_default=config.interactive_default, + ) if __name__ == '__main__': diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py b/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py index a83e8b91..dd14ce25 100644 --- a/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py +++ b/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py @@ -1,622 +1,872 @@ -"""Runner for executing artifact tasks in Docker or on host. +"""Orchestration for executing artifact tasks in Docker or on host. -Single entry point: run_eval(env, project_path, task_id, task, model, agent_path, save_path, ...). -- env='local' → run on host (internal: _run_local). -- env != 'local' → run in Docker (internal: run_eval_in_env). +Single entry point: run_eval(env, project_path, task_id, ...). +- env='local' -> _run_local() -> runner.run_agent() directly on host +- env != 'local' -> _run_in_docker() -> runner.py executed inside container """ +from __future__ import annotations + import asyncio import json +import logging import os import shutil import subprocess import sys import tempfile +import time +from dataclasses import dataclass from pathlib import Path -from .utils import DEFAULT_TIMEOUT_MS, safe_task_id +from .runner import run_agent +from .utils import ( + DEFAULT_DOCKER_IMAGE, + apply_timeout_env, + has_api_key, + is_local_env, + parse_eval_score, + resolve_timeout_ms, + safe_task_id, + status_from_exit_code, + timeout_env_dict, +) -# Try to import SWE-ReX (historically called "swerex") for Docker deployment. SWEREX_AVAILABLE = False + +def _import_swerex(): + """Try importing swerex under both package names (swerex and swe_rex). + + The package was renamed; we support both for backward compatibility. + Returns (DockerDeploymentConfig, BashAction, CreateBashSessionRequest, UploadRequest) + or raises ImportError. 
+ """ + for pkg in ('swerex', 'swe_rex'): + try: + mod_docker = __import__(f'{pkg}.deployment.docker', fromlist=['DockerDeploymentConfig']) + mod_runtime = __import__( + f'{pkg}.runtime.abstract', fromlist=['BashAction', 'CreateBashSessionRequest', 'UploadRequest'] + ) + return ( + mod_docker.DockerDeploymentConfig, + mod_runtime.BashAction, + mod_runtime.CreateBashSessionRequest, + mod_runtime.UploadRequest, + ) + except ImportError: + continue + raise ImportError("Neither 'swerex' nor 'swe_rex' is installed") + + try: - from swerex.deployment.docker import DockerDeploymentConfig - from swerex.runtime.abstract import BashAction, Command, CreateBashSessionRequest, UploadRequest + DockerDeploymentConfig, BashAction, CreateBashSessionRequest, UploadRequest = _import_swerex() SWEREX_AVAILABLE = True except ImportError: - try: - from swe_rex.deployment.docker import DockerDeploymentConfig - from swe_rex.runtime.abstract import BashAction, Command, CreateBashSessionRequest, UploadRequest - SWEREX_AVAILABLE = True - except ImportError: - SWEREX_AVAILABLE = False - print("WARNING: swerex/swe-rex not available. Docker mode will not work.", file=sys.stderr) - - -def build_system_prompt(artifact_path: str, task: str) -> str: - """Build the system prompt for running an artifact task on the host.""" - return f"""You are an experienced software engineer completing an artifact task. 
- -ENVIRONMENT SETUP (HOST MACHINE - NOT DOCKER): -- You are running DIRECTLY on the host machine (NOT inside a Docker container) -- Docker daemon is already running on this host -- When you use Kind to create Kubernetes clusters, they will be created using the host's Docker -- This avoids Docker-in-Docker compatibility issues -- You may need sudo for some operations - -ARTIFACT LOCATION: -- The artifact repository is located at: {artifact_path} -- Start by changing to this directory: cd {artifact_path} - -YOUR TASK: -{task} - -TIMEOUT CONFIGURATION (CRITICAL): -- Long-running commands (builds, tests, Kind cluster creation) are expected -- DO NOT set short timeouts - let commands complete naturally -- Kind cluster creation can take 5-10 minutes -- Full benchmark runs can take hours - -IMPORTANT GUIDELINES: -1. First, cd to {artifact_path} and examine the directory structure -2. Follow the README instructions step by step -3. You MUST execute every verification step, test, or command that the README (or referenced docs like TESTBED.md) says is required for evaluation or reproduction. Do NOT skip any such step just because the README mentions that it may take a long time. Long runtimes are expected; run each verification and wait for completion. -4. If you see 'sudo' in instructions, you can use it (or skip if already root) -5. Use the Bash tool to run commands, Read tool to inspect files -6. Work systematically through setup, build, and experiment execution -7. If you encounter errors, debug and resolve them using available tools -8. For Kind clusters, they will work properly since you're on the host (not DinD)""" - - -def check_prerequisites_on_host() -> bool: - """Check that required tools (docker, python, API key) are available on the host. Returns True if OK.""" - if not shutil.which("docker"): - print("ERROR: Docker is not installed on host.", file=sys.stderr) + logging.warning('swerex/swe-rex not available. 
Docker mode will not work.') + + +# Progress log every 5 minutes when runner is still running. +_PROGRESS_LOG_INTERVAL_SEC = 300 + +# Poll interval for checking runner status. +_POLL_INTERVAL_SEC = 10.0 + + +@dataclass +class _RunnerResult: + """Result from a Docker runner process.""" + + exit_code: int + output: str + + +def _make_eval_result( + task_id: str, + task: str, + project_path: str, + agent_output: str, + status: str, + run_on_host: bool, + *, + container_id: str | None = None, + saved_image: str | None = None, + container_stopped: bool = False, + message_count: int | None = None, + score: int | None = None, + test_method: str | None = None, +) -> dict: + """Build unified eval result dict for both host and Docker modes.""" + result = { + 'task_id': task_id, + 'task': task, + 'project_path': project_path, + 'agent_run_results': agent_output, + 'status': status, + 'run_on_host': run_on_host, + 'container_id': container_id, + 'saved_image': saved_image, + 'container_stopped': container_stopped, + } + if message_count is not None: + result['message_count'] = message_count + if score is not None: + result['score'] = score + if test_method is not None: + result['test_method'] = test_method + return result + + +def make_error_result( + task_id: str, + task: str, + project_path: str, + error_message: str, + env: str, +) -> dict: + """Build result dict for run_eval failure (exception/timeout). 
Same shape as normal result.""" + return _make_eval_result( + task_id, + task, + project_path, + error_message, + 'error', + is_local_env(env), + ) + + +# --------------------------------------------------------------------------- +# Host mode +# --------------------------------------------------------------------------- + + +def _check_host_prerequisites() -> bool: + """Check that docker, python, and API key are available on the host.""" + if not shutil.which('docker'): + logging.error('Docker is not installed on host.') return False - result = subprocess.run(["docker", "ps"], capture_output=True, timeout=10) - if result.returncode != 0: - print("ERROR: Docker is not running on host.", file=sys.stderr) + if subprocess.run(['docker', 'ps'], capture_output=True, timeout=10).returncode != 0: + logging.error('Docker is not running on host.') return False - if not os.environ.get("ANTHROPIC_API_KEY") and not os.environ.get("ANTHROPIC_FOUNDRY_API_KEY"): - print("ERROR: ANTHROPIC_API_KEY or ANTHROPIC_FOUNDRY_API_KEY must be set.", file=sys.stderr) + if not has_api_key(): + logging.error('Neither ANTHROPIC_API_KEY nor ANTHROPIC_FOUNDRY_API_KEY is set.') return False return True -def setup_claude_settings_on_host(timeout_ms: int): - """Set up ~/.claude/settings.json with timeout configuration on host.""" - claude_dir = Path.home() / ".claude" - settings_file = claude_dir / "settings.json" +def _write_claude_settings(timeout_ms: int): + """Write ~/.claude/settings.json with timeout configuration.""" + claude_dir = Path.home() / '.claude' claude_dir.mkdir(exist_ok=True) - settings = { - "env": { - "BASH_MAX_TIMEOUT_MS": str(timeout_ms), - "BASH_DEFAULT_TIMEOUT_MS": str(timeout_ms), - } - } - with open(settings_file, 'w') as f: + settings = {'env': timeout_env_dict(timeout_ms)} + with open(claude_dir / 'settings.json', 'w', encoding='utf-8') as f: json.dump(settings, f, indent=2) - print(f"Created {settings_file} with timeout configuration: {timeout_ms} ms.") async def 
async def _run_local(
    project_path,
    task_id,
    task,
    model,
    timeout_ms: int,
    *,
    skip_prereq_check: bool = False,
    interactive: bool = False,
):
    """Run one task on the host by delegating to runner.run_agent().

    Args:
        project_path: Artifact repository path (made absolute before use).
        task_id: Task identifier recorded in the result dict.
        task: Task description handed to the agent.
        model: Model name for run_agent().
        timeout_ms: Bash timeout written to ~/.claude/settings.json.
        skip_prereq_check: Skip the docker/API-key host check.
        interactive: Forwarded to run_agent() for interactive follow-ups.

    Returns:
        Result dict built by _make_eval_result() with run_on_host=True.

    Raises:
        RuntimeError: when host prerequisites fail or project_path is missing.
    """
    banner = '=' * 80
    print(banner)
    print('Running task on HOST MACHINE')
    print(banner)

    if not skip_prereq_check and not _check_host_prerequisites():
        raise RuntimeError('Host prerequisites check failed')

    _write_claude_settings(timeout_ms)
    # run_eval() already calls apply_timeout_env() for local; no need to duplicate here.

    project_path = os.path.abspath(project_path)
    if not os.path.isdir(project_path):
        raise RuntimeError(f'Project path does not exist: {project_path}')

    for label, value in (('Project path', project_path), ('Task ID', task_id), ('Model', model)):
        print(f'{label}: {value}')

    agent_result = await run_agent(
        model,
        task,
        env='local',
        artifact_path=project_path,
        timeout_ms=timeout_ms,
        interactive=interactive,
    )

    return _make_eval_result(
        task_id,
        task,
        project_path,
        agent_result['output'],
        status_from_exit_code(agent_result['exit_code']),
        run_on_host=True,
        message_count=agent_result['message_count'],
    )
Total messages: {message_count}") - print("\n" + "=" * 60) - print("Interactive mode - Type your follow-up instructions (or 'quit'/'exit' to end).") - print("=" * 60 + "\n") - - while True: - try: - user_input = input("\n>>> ").strip() - except (EOFError, KeyboardInterrupt): - print("\nExiting interactive mode.", flush=True) - break - if not user_input: - continue - if user_input.lower() in ('quit', 'exit', 'q'): - print("Exiting interactive mode.", flush=True) - break - - await client.query(user_input) - async for msg in client.receive_response(): - msg_str = str(msg) - print(msg_str) - if 'ResultMessage' in msg_str or 'TextBlock' in msg_str: - run_results_output = msg_str - else: - print("Starting Claude Agent SDK (Host Mode)...") - try: - async for message in query( - prompt=f"Please start the artifact task. Begin by changing to the artifact directory at {project_path} and examining its contents.", - options=options - ): - message_count += 1 - if message_count % 10 == 0: - print(f"[Progress] Processed {message_count} messages...") - msg_str = str(message) - print(msg_str) - if 'ResultMessage' in msg_str or 'TextBlock' in msg_str: - run_results_output = msg_str - - print(f"Claude Agent SDK execution completed. Total messages: {message_count}") +# Default timeout for running the evaluation script (e.g. pytest or oracle script) on host. 
+_EVAL_SCRIPT_TIMEOUT_SEC = 600 - except Exception as e: - print(f"ERROR: Claude Agent SDK execution failed: {e}") - import traceback - traceback.print_exc() - run_results_output = f"Error: {e}" - result = { - 'task_id': task_id, - 'task': task, - 'project_path': project_path, - 'agent_run_results': run_results_output, - 'message_count': message_count, - 'status': 'success' if message_count > 0 else 'error', - 'run_on_host': True, - 'container_id': None, - 'saved_image': None, - 'container_stopped': False, - } +async def _run_agent_then_eval_async( + project_path: str, + task_id: str, + task: str, + model: str, + test_method: str | None, + save_path: str, + timeout_ms: int | None = None, + *, + skip_prereq_check: bool = False, +) -> dict: + """Run agent on host, then run evaluation script (test_method); return result with score. - return result + Used by arteval_bench when run_on_host=True and agent is ae_agent. Same flow as claude_sdk: + agent run → run test_method (e.g. cd project_path && python _agent_eval/main.py) → parse score. + """ + timeout_ms = resolve_timeout_ms(timeout_ms) + if not skip_prereq_check and not _check_host_prerequisites(): + raise RuntimeError('Host prerequisites check failed') + apply_timeout_env(timeout_ms) + _write_claude_settings(timeout_ms) + project_path = os.path.abspath(project_path) + if not os.path.isdir(project_path): + raise RuntimeError(f'Project path does not exist: {project_path}') + + # 1. Run agent + agent_result = await run_agent( + model, + task, + env='local', + artifact_path=project_path, + timeout_ms=timeout_ms, + interactive=False, + ) + agent_output = agent_result['output'] + agent_status = status_from_exit_code(agent_result['exit_code']) -def _save_container_as_image(container_id: str, project_path: str, task_id: str) -> tuple[str | None, bool]: - """Save Docker container as image (docker cp, commit, stop). 
Returns (saved_image_tag or None, container_stopped).""" - project_path_abs = os.path.abspath(project_path) - if os.path.isdir(project_path_abs): + # 2. Run evaluation script if provided + if test_method and test_method.strip(): try: - cp_proc = subprocess.run( - ["docker", "cp", f"{container_id}:/repo/.", project_path_abs], + eval_cmd = f'cd {project_path} && {test_method}' + eval_result = subprocess.run( + eval_cmd, + shell=True, capture_output=True, text=True, - timeout=600, + timeout=_EVAL_SCRIPT_TIMEOUT_SEC, ) - if cp_proc.returncode == 0: - print(f"Synced container /repo to host workspace: {project_path_abs}") - else: - print( - f"WARNING: docker cp failed (container {container_id} -> {project_path_abs}): " - f"{cp_proc.stderr.strip()}" - ) + test_output = (eval_result.stdout or '').strip() + score = parse_eval_score(test_output) + status = 'success' if agent_status == 'success' else agent_status except subprocess.TimeoutExpired: - print(f"WARNING: docker cp timed out copying /repo from container {container_id}") + test_output = '(evaluation script timed out)' + score = 0 + status = 'error' except Exception as e: - print(f"WARNING: Exception during docker cp from container {container_id}: {e}") + test_output = str(e) + score = 0 + status = f'error: {e}' else: - print(f"WARNING: project_path does not exist, skipping workspace sync: {project_path_abs}") + test_output = '' + score = 0 + status = agent_status + + return _make_eval_result( + task_id, + task, + project_path, + agent_output, + status, + run_on_host=True, + score=score, + test_method=test_method or '', + ) - sid = safe_task_id(task_id, fallback="unknown_task") - saved_image = f"ae-agent-{sid.lower()}:latest" - try: - commit_proc = subprocess.run( - ["docker", "commit", container_id, saved_image], - capture_output=True, - text=True, - timeout=600, + +def run_agent_then_eval( + project_path: str, + task_id: str, + task: str, + model: str, + test_method: str | None, + save_path: str, + timeout_ms: int | 
None = None, + *, + skip_prereq_check: bool = False, +) -> dict: + """Synchronous entry: run agent on host then evaluation script; return result with score. + + Called by arteval_bench run_eval_in_env.run_eval_on_host when agent is ae_agent. + """ + return asyncio.run( + _run_agent_then_eval_async( + project_path, + task_id, + task, + model, + test_method, + save_path, + timeout_ms=timeout_ms, + skip_prereq_check=skip_prereq_check, ) - if commit_proc.returncode == 0: - print(f"Saved container {container_id} as image '{saved_image}'.") - else: - print( - f"WARNING: docker commit failed for container {container_id}: " - f"{commit_proc.stderr.strip()}" - ) - saved_image = None - except Exception as e: - print(f"WARNING: Exception during docker commit for container {container_id}: {e}") - saved_image = None + ) + + +# --------------------------------------------------------------------------- +# Docker helpers +# --------------------------------------------------------------------------- + + +def _validate_agent_path(agent_path: str) -> None: + """Ensure agent_path exists and has required files.""" + if not agent_path or not os.path.isdir(agent_path): + raise RuntimeError(f'Agent path does not exist: {agent_path}') + for name in ('runner.sh', 'runner.py', 'install.sh'): + if not os.path.isfile(os.path.join(agent_path, name)): + raise RuntimeError(f'Agent path missing required file: {name}') + - container_stopped = False +def _stdin_is_tty() -> bool: + """Return True if stdin is a real TTY (required for docker exec -it).""" + return hasattr(sys.stdin, 'isatty') and sys.stdin.isatty() + + +def _run_docker_cmd( + args: list[str], + *, + timeout: int = 60, + on_success_message: str | None = None, + on_fail_message: str = 'docker command failed', +) -> bool: + """Run a docker subprocess. 
Return True if returncode is 0, else False and log.""" try: - stop_proc = subprocess.run( - ["docker", "stop", container_id], + r = subprocess.run( + args, capture_output=True, text=True, - timeout=60, + timeout=timeout, ) - if stop_proc.returncode == 0: - print(f"Stopped container {container_id}.") - container_stopped = True - else: - print( - f"WARNING: docker stop failed for container {container_id}: " - f"{stop_proc.stderr.strip()}" - ) - except Exception as e: - print(f"WARNING: Exception during docker stop for container {container_id}: {e}") + if r.returncode == 0: + if on_success_message: + print(on_success_message) + return True + logging.warning('%s: %s', on_fail_message, (r.stderr or r.stdout or '').strip()) + return False + except subprocess.TimeoutExpired: + logging.warning('docker command timed out (timeout=%ds)', timeout) + return False + except (OSError, subprocess.SubprocessError) as e: + logging.warning('docker command error: %s', e) + return False - return (saved_image, container_stopped) +def _merge_tree(src_dir: str, dst_dir: str, exclude: tuple[str, ...] = ('.venv', '.git', '__pycache__')) -> None: + """Merge src_dir into dst_dir (copy missing/updated from src into dst).""" + os.makedirs(dst_dir, exist_ok=True) + for name in os.listdir(src_dir): + if name in exclude: + continue + src_path = os.path.join(src_dir, name) + dst_path = os.path.join(dst_dir, name) + if os.path.isdir(src_path): + if os.path.isdir(dst_path): + _merge_tree(src_path, dst_path, exclude) + elif os.path.exists(dst_path): + logging.warning('Sync skip (destination not a dir): %s', dst_path) + else: + shutil.copytree(src_path, dst_path) + else: + try: + shutil.copy2(src_path, dst_path) + except OSError as e: + logging.warning('Sync copy failed %s -> %s: %s', src_path, dst_path, e) -def _validate_agent_path(agent_path: str) -> None: - """Ensure agent_path exists and has required files. 
def _sync_workspace(container_id: str, project_path: str) -> None:
    """Copy /repo from container back to host project_path.

    Uses a temp copy plus merge with excludes to avoid overwriting host .venv
    (e.g. when container has .venv/lib64 as a directory and host has it as a
    symlink, which would cause 'cannot overwrite non-directory with directory').
    """
    project_abs = os.path.abspath(project_path)
    if not os.path.isdir(project_abs):
        print(f'WARNING: project_path missing, skipping sync: {project_abs}')
        return

    # Exclude .venv* and .git to avoid overwriting host venv or permission issues
    def _skip_sync(name: str) -> bool:
        return name in ('.git', '.venv') or name.startswith('.venv-')

    with tempfile.TemporaryDirectory(prefix='ae_sync_') as tmp:
        dest_tmp = os.path.join(tmp, 'repo')
        copied = _run_docker_cmd(
            ['docker', 'cp', f'{container_id}:/repo', dest_tmp],
            timeout=600,
            on_fail_message='docker cp (to temp) failed',
        )
        if not copied:
            return
        # docker cp container:/repo dest_tmp puts repo contents into dest_tmp
        for name in os.listdir(dest_tmp):
            if _skip_sync(name):
                continue
            src_path = os.path.join(dest_tmp, name)
            dst_path = os.path.join(project_abs, name)
            try:
                if not os.path.isdir(src_path):
                    shutil.copy2(src_path, dst_path)
                elif os.path.exists(dst_path):
                    _merge_tree(src_path, dst_path)
                else:
                    shutil.copytree(src_path, dst_path)
            except (OSError, shutil.Error) as e:
                logging.warning('Sync item %s failed: %s', name, e)
    print(f'Synced container /repo -> {project_abs}')


def _commit_container(container_id: str, task_id: str) -> str | None:
    """Commit container state as a Docker image. Returns image tag or None."""
    sid = safe_task_id(task_id, fallback='unknown_task')
    image_tag = f'ae-agent-{sid.lower()}:latest'
    committed = _run_docker_cmd(
        ['docker', 'commit', container_id, image_tag],
        timeout=600,
        on_fail_message='docker commit failed',
    )
    return image_tag if committed else None
Returns True if stopped successfully.""" + return _run_docker_cmd( + ['docker', 'stop', container_id], + timeout=60, + on_success_message=f'Stopped container {container_id}.', + on_fail_message='docker stop failed', + ) - _validate_agent_path(agent_path) - await deployment.start() - runtime = deployment.runtime +def _save_container( + container_id: str, + project_path: str, + task_id: str, +) -> tuple[str | None, bool]: + """Sync workspace, commit image, and stop container.""" + _sync_workspace(container_id, project_path) + image_tag = _commit_container(container_id, task_id) + stopped = _stop_container(container_id) + return image_tag, stopped - timeout_ms_env = os.environ.get("BASH_MAX_TIMEOUT_MS") + +async def _get_container_id(runtime) -> str | None: + """Get container hostname/ID from inside the container.""" try: - timeout_s = float(timeout_ms_env) / 1000.0 if timeout_ms_env else (DEFAULT_TIMEOUT_MS / 1000.0) - except (ValueError, TypeError): - timeout_s = DEFAULT_TIMEOUT_MS / 1000.0 + cid = ( + await _run_bash( + runtime, + 'cat /etc/hostname 2>/dev/null || hostname 2>/dev/null || echo ""', + 10.0, + ) + ).strip() + return cid if cid and cid != 'unknown' else None + except (AttributeError, TypeError, ValueError) as e: + logging.debug('Could not get container ID: %s', e) + return None - if hasattr(runtime, "_config"): - print(f"Current RemoteRuntime timeout: {runtime._config.timeout}s") - runtime._config.timeout = timeout_s - print(f"Overriding RemoteRuntime timeout to {timeout_s}s based on BASH_MAX_TIMEOUT_MS") - await runtime.create_session(CreateBashSessionRequest()) +def _shell_escape(s: str) -> str: + """Escape a string for use inside single-quoted shell arguments.""" + return s.replace("'", "'\"'\"'") + - print('Uploading project files...') - await runtime.upload( - UploadRequest( - source_path=project_path, - target_path='/repo', +def _build_api_env_dict(timeout_ms: int) -> dict[str, str]: + """Build env vars dict for API keys, Foundry, and timeouts. 
+ + Single source of truth for _docker_exec_env_args and _setup_container_env. + """ + api_key = os.environ.get('ANTHROPIC_API_KEY') + foundry_key = os.environ.get('ANTHROPIC_FOUNDRY_API_KEY') + env = dict(timeout_env_dict(timeout_ms)) + if api_key: + env['ANTHROPIC_API_KEY'] = api_key + if foundry_key: + env['ANTHROPIC_FOUNDRY_API_KEY'] = foundry_key + if not api_key: + env['ANTHROPIC_API_KEY'] = foundry_key + foundry_url = os.environ.get('ANTHROPIC_FOUNDRY_BASE_URL') + if foundry_url: + env['ANTHROPIC_FOUNDRY_BASE_URL'] = foundry_url + if os.environ.get('CLAUDE_CODE_USE_FOUNDRY') == '1': + env['CLAUDE_CODE_USE_FOUNDRY'] = '1' + return env + + +def _docker_exec_env_args(timeout_ms: int) -> list[str]: + """Build -e VAR=value args for docker exec (env vars needed by runner.py).""" + env = _build_api_env_dict(timeout_ms) + args = [] + for k, v in env.items(): + args.extend(['-e', f'{k}={v}']) + return args + + +async def _upload_task(runtime, task: str, task_file_path: str | None): + """Upload task description to /agent/current_task.txt inside container.""" + tmpdir = tempfile.mkdtemp(prefix='ae_task_') + try: + dest = os.path.join(tmpdir, 'current_task.txt') + if task_file_path and os.path.isfile(task_file_path): + shutil.copy2(task_file_path, dest) + else: + with open(dest, 'w', encoding='utf-8') as f: + f.write(task) + await runtime.upload(UploadRequest(source_path=tmpdir, target_path='/agent_task_file')) + await _run_bash( + runtime, + 'cp /agent_task_file/current_task.txt /agent/current_task.txt', + 10.0, ) - ) - print('Project files uploaded.') + finally: + shutil.rmtree(tmpdir, ignore_errors=True) - is_ae_agent = 'ae_agent' in str(agent_path) or str(agent_path).endswith('claude_sdk') - await runtime.run_in_session(BashAction(command='cd /repo')) - pwd_result = await runtime.run_in_session(BashAction(command='pwd')) - print(f'Current directory: {pwd_result}') - ls_result = await runtime.run_in_session(BashAction(command='ls')) - print(f'Current directory 
contents: {ls_result}') +async def _setup_container_env(runtime, timeout_ms: int): + """Set timeout and API keys inside the container.""" + env = _build_api_env_dict(timeout_ms) + parts = [f"export {k}='{_shell_escape(v)}'" for k, v in env.items()] + await _run_bash(runtime, ' && '.join(parts)) - print('Uploading agent runner script...') - await runtime.upload( - UploadRequest( - source_path=agent_path, - target_path='/agent', - ) + if not has_api_key(): + logging.warning('No API key found. Runner may fail.') + + +def _extract_output(res) -> str: + """Extract output string from swe-rex/bash action result.""" + return str(getattr(res, 'output', '')).strip() + + +async def _run_bash(runtime, command: str, timeout: float = 10.0) -> str: + """Run a Bash command in the container session and return its output. Reduces duplication.""" + res = await runtime.run_in_session(BashAction(command=command, timeout=timeout)) + return _extract_output(res) + + +async def _start_runner_background(runtime, model: str) -> str | None: + """Start runner.sh in background, return pid or None.""" + await _run_bash( + runtime, + 'rm -f /agent/runner.live.log && touch /agent/runner.live.log', + 10.0, ) - print('Agent runner script uploaded.') + output = await _run_bash( + runtime, + ( + f'stdbuf -oL -eL /agent/runner.sh "{model}" /agent/current_task.txt ' + f'> /agent/runner.live.log 2>&1 & ' + f'RUNNER_PID=$!; sleep 1; echo RUNNER_PID=$RUNNER_PID' + ), + 30.0, + ) + pid = None + for line in output.split('\n'): + if 'RUNNER_PID=' in line: + pid = line.split('RUNNER_PID=', 1)[1].strip() + break + if not pid or not pid.strip().isdigit(): + await asyncio.sleep(2) + pid = await _run_bash( + runtime, + "ps aux | grep '[r]unner.py' | awk '{print $2}' | head -1", + 10.0, + ) + pid = (pid or '').strip() + return pid if pid.isdigit() else None - print('Setup the agent running environment...') - await runtime.run_in_session(BashAction(command='chmod +x /agent/runner.sh /agent/install.sh 2>/dev/null; 
/agent/install.sh')) - if task_file_path and os.path.isfile(task_file_path): - tmpdir = tempfile.mkdtemp(prefix='ae_agent_task_') - try: - dest = os.path.join(tmpdir, 'current_task.txt') - shutil.copy2(task_file_path, dest) - await runtime.upload(UploadRequest(source_path=tmpdir, target_path='/agent_task_file')) - await runtime.run_in_session(BashAction(command='cp /agent_task_file/current_task.txt /agent/current_task.txt', timeout=10.0)) - finally: - shutil.rmtree(tmpdir, ignore_errors=True) +async def _read_runner_log(runtime, elapsed: float, last_log: str) -> str: + """Read live log and print new content. Returns updated last_log.""" + try: + cur = await _run_bash(runtime, 'cat /agent/runner.live.log 2>/dev/null || echo ""', 30.0) + if cur and cur != last_log: + new = cur[len(last_log) :].strip() if cur.startswith(last_log) else cur + if new: + print(f'[log @ {elapsed:.0f}s]\n{new}', flush=True) + return cur + except (AttributeError, TypeError, ValueError) as e: + logging.debug('Log read error: %s', e) + return last_log + + +async def _check_runner_exited(runtime, pid: str | None) -> _RunnerResult | None: + """Check if runner process has exited. Returns _RunnerResult if exited, else None.""" + if pid and pid.isdigit(): + ps_out = await _run_bash(runtime, f'ps -p {pid} >/dev/null 2>&1; echo $?', 10.0) + if ps_out.strip() != '0': + code = await _run_bash(runtime, f'wait {pid} 2>/dev/null; echo $?', 30.0) + ec = int(code.strip()) if code.strip().isdigit() else -1 + return _RunnerResult(exit_code=ec, output=f'exit_code={ec}') else: - tmpdir = tempfile.mkdtemp(prefix='ae_agent_task_') + # PID was never captured (e.g. RUNNER_PID parse failed); detect exit by process count. 
+ cnt = await _run_bash(runtime, "ps aux | grep '[r]unner.py' | wc -l", 10.0) + if not cnt.strip().isdigit() or int(cnt.strip()) == 0: + return _RunnerResult(exit_code=-1, output='exit_code=unknown') + return None + + +async def _handle_runner_timeout(runtime, pid: str | None) -> None: + """Kill runner and print log tail on timeout.""" + if pid and pid.isdigit(): try: - task_file_host = os.path.join(tmpdir, 'current_task.txt') - with open(task_file_host, 'w', encoding='utf-8') as f: - f.write(task) - await runtime.upload(UploadRequest(source_path=tmpdir, target_path='/agent_task_file')) - await runtime.run_in_session(BashAction(command='cp /agent_task_file/current_task.txt /agent/current_task.txt', timeout=10.0)) - finally: - shutil.rmtree(tmpdir, ignore_errors=True) - - if timeout_ms_env: - set_timeout_cmd = ( - f"export BASH_MAX_TIMEOUT_MS='{timeout_ms_env}' && " - f"export BASH_DEFAULT_TIMEOUT_MS='{timeout_ms_env}'" - ) - print(f"Setting BASH_MAX_TIMEOUT_MS/BASH_DEFAULT_TIMEOUT_MS in container to {timeout_ms_env} ms...") - await runtime.run_in_session(BashAction(command=set_timeout_cmd)) - - if is_ae_agent: - parts = [] - anthropic_api_key = os.environ.get('ANTHROPIC_API_KEY') - foundry_api_key = os.environ.get('ANTHROPIC_FOUNDRY_API_KEY') - if anthropic_api_key: - escaped_key = anthropic_api_key.replace("'", "'\"'\"'") - parts.append(f"export ANTHROPIC_API_KEY='{escaped_key}'") - if foundry_api_key: - escaped_foundry_key = foundry_api_key.replace("'", "'\"'\"'") - parts.append(f"export ANTHROPIC_FOUNDRY_API_KEY='{escaped_foundry_key}'") - if not anthropic_api_key: - parts.append(f"export ANTHROPIC_API_KEY='{escaped_foundry_key}'") - foundry_base = os.environ.get('ANTHROPIC_FOUNDRY_BASE_URL') - if foundry_base: - escaped_url = foundry_base.replace("'", "'\"'\"'") - parts.append(f"export ANTHROPIC_FOUNDRY_BASE_URL='{escaped_url}'") - if os.environ.get('CLAUDE_CODE_USE_FOUNDRY') == '1': - parts.append("export CLAUDE_CODE_USE_FOUNDRY=1") - if parts: - set_env_cmd = 
" && ".join(parts) - print('Setting Anthropic/Foundry API key and env in container...') - await runtime.run_in_session(BashAction(command=set_env_cmd)) - if not anthropic_api_key and not foundry_api_key: - print('WARNING: Neither ANTHROPIC_API_KEY nor ANTHROPIC_FOUNDRY_API_KEY found on host. Runner may fail.') - - print('Running runner script...') - runner_timeout = timeout_s if is_ae_agent else min(timeout_s, 1200.0) - - container_id_early = None - try: - container_id_res = await runtime.run_in_session( - BashAction( - command='cat /etc/hostname 2>/dev/null || hostname 2>/dev/null || echo "unknown"', - timeout=10.0, + await _run_bash( + runtime, + f'kill -TERM {pid} 2>/dev/null || kill -9 {pid} 2>/dev/null || true', + 10.0, ) - ) - container_id_early = str(getattr(container_id_res, "output", "")).strip() - if container_id_early == "unknown": - container_id_early = None - except Exception as e: - print(f"WARNING: Failed to get container id early (will retry after runner): {e}") + except (AttributeError, TypeError, ConnectionError) as e: + logging.debug('Kill runner failed: %s', e) + try: + tail_str = await _run_bash(runtime, 'tail -n 200 /agent/runner.live.log', 30.0) + print(f'Log tail (timeout):\n{tail_str}', flush=True) + except (AttributeError, TypeError, ValueError) as e: + logging.debug('Could not read log tail: %s', e) + + +async def _monitor_runner(runtime, model: str, timeout_s: float) -> _RunnerResult: + """Start runner.sh in background and poll logs until it finishes or times out.""" + pid = await _start_runner_background(runtime, model) + print(f'Runner started (pid={pid})', flush=True) + + start = time.monotonic() + last_log = '' + last_progress_at = 0.0 + + while True: + elapsed = time.monotonic() - start + if elapsed >= timeout_s: + break + + last_log = await _read_runner_log(runtime, elapsed, last_log) + if elapsed - last_progress_at >= _PROGRESS_LOG_INTERVAL_SEC: + print(f'[still running @ {elapsed:.0f}s]', flush=True) + last_progress_at = elapsed 
+ + result = await _check_runner_exited(runtime, pid) + if result is not None: + print(f'Runner finished (exit_code={result.exit_code})', flush=True) + return result + + await asyncio.sleep(_POLL_INTERVAL_SEC) + + await _handle_runner_timeout(runtime, pid) + raise TimeoutError(f'Runner exceeded timeout {timeout_s}s') + + +# --------------------------------------------------------------------------- +# Docker mode +# --------------------------------------------------------------------------- + + +async def _run_interactive_in_container( + container_id: str, + task_id: str, + task: str, + project_path: str, + model: str, + timeout_ms: int, +) -> dict: + """Run task + interactive in foreground via docker exec -it. + The same agent session handles both task and follow-up, preserving context. + """ + print( + '\n' + + '=' * 60 + + '\nTask + interactive mode (foreground, context preserved).\n' + + "Type 'quit' or 'exit' to end the interactive session.\n" + + '=' * 60, + flush=True, + ) + exec_args = [ + 'docker', + 'exec', + '-it', + *_docker_exec_env_args(timeout_ms), + container_id, + 'python3', + '-u', + '/agent/runner.py', + model, + '/agent/current_task.txt', + '--interactive', + ] try: - if is_ae_agent: - await runtime.run_in_session(BashAction(command='rm -f /agent/runner.live.log && touch /agent/runner.live.log', timeout=10.0)) - - start_cmd = ( - 'stdbuf -oL -eL /agent/runner.sh "' + model + '" /agent/current_task.txt > /agent/runner.live.log 2>&1 & ' - 'RUNNER_PID=$!; ' - 'sleep 1; ' - 'echo RUNNER_PID=$RUNNER_PID' - ) - start_res = await runtime.run_in_session(BashAction(command=start_cmd, timeout=30.0)) - start_output = str(getattr(start_res, "output", "")).strip() - - pid = None - for line in start_output.split('\n'): - if 'RUNNER_PID=' in line: - pid = line.split('RUNNER_PID=', 1)[1].strip() - break - - if not pid or not pid.isdigit(): - await asyncio.sleep(2) - ps_res = await runtime.run_in_session( - BashAction(command="ps aux | grep '[r]unner.py' | awk 
'{print $2}' | head -1", timeout=10.0) - ) - pid = str(getattr(ps_res, "output", "")).strip() - - print(f'ae-agent runner started with pid: {pid}') - await asyncio.sleep(2) - - elapsed = 0.0 - poll_interval = 10.0 - run_results = None - last_log_content = "" - - while elapsed < runner_timeout: - try: - log_res = await runtime.run_in_session( - BashAction(command='cat /agent/runner.live.log 2>/dev/null || echo ""', timeout=30.0) - ) - current_log_content = str(getattr(log_res, "output", "")).strip() - - if current_log_content and current_log_content != last_log_content: - if last_log_content and current_log_content.startswith(last_log_content): - new_content = current_log_content[len(last_log_content):].strip() - if new_content: - print(f'[ae-agent live log @ {elapsed:.0f}s ({elapsed/60:.1f} min)]\n{new_content}') - else: - print(f'[ae-agent live log @ {elapsed:.0f}s ({elapsed/60:.1f} min)]\n{current_log_content}') - last_log_content = current_log_content - elif elapsed % 300 == 0 and elapsed > 0: - print(f'[ae-agent still running @ {elapsed:.0f}s ({elapsed/60:.1f} min), no new output]') - except Exception as e: - print(f'Failed to read ae-agent live log: {e}') - - if pid and pid.isdigit(): - ps_res = await runtime.run_in_session( - BashAction(command=f'ps -p {pid} >/dev/null 2>&1; echo $?', timeout=10.0) - ) - ps_code = str(getattr(ps_res, "output", "")).strip() - if ps_code != "0": - wait_res = await runtime.run_in_session( - BashAction(command=f'wait {pid} 2>/dev/null; echo $?', timeout=30.0) - ) - exit_code_str = str(getattr(wait_res, "output", "")).strip() - class MockResult: - def __init__(self, code): - self.exit_code = int(code) if code.isdigit() else 0 - self.output = f'exit_code={self.exit_code}' - run_results = MockResult(exit_code_str) - print(f'ae-agent runner finished with exit code: {run_results.exit_code}') - break - else: - ps_res = await runtime.run_in_session( - BashAction(command="ps aux | grep '[r]unner.py' | wc -l", timeout=10.0) - ) - 
proc_count = str(getattr(ps_res, "output", "")).strip() - if proc_count == "0" or not proc_count.isdigit() or int(proc_count) == 0: - print('ae-agent runner process not found, assuming finished') - class MockResult: - def __init__(self): - self.exit_code = 0 - self.output = 'exit_code=0' - run_results = MockResult() - break - - await asyncio.sleep(poll_interval) - elapsed += poll_interval - - if run_results is None: - if pid and pid.isdigit(): - try: - await runtime.run_in_session(BashAction(command=f'kill -TERM {pid} 2>/dev/null || kill -9 {pid} 2>/dev/null || true', timeout=10.0)) - except Exception: - pass - try: - tail_log = await runtime.run_in_session( - BashAction(command='tail -n 200 /agent/runner.live.log', timeout=30.0) - ) - print(f'ae-agent live log tail (on timeout):\n{tail_log}') - except Exception as e: - print(f'Failed to read ae-agent live log after timeout: {e}') - raise TimeoutError(f'ae-agent runner exceeded timeout {runner_timeout}s') + proc = await asyncio.to_thread( + subprocess.run, + exec_args, + stdin=sys.__stdin__, + stdout=sys.__stdout__, + stderr=sys.__stderr__, + ) + run_exit_code = proc.returncode + except (OSError, subprocess.SubprocessError) as e: + logging.warning('Foreground execution failed for task %s: %s', task_id, e) + run_exit_code = 1 + + return _make_eval_result( + task_id, + task, + project_path, + f'Interactive session (exit_code={run_exit_code})', + status_from_exit_code(run_exit_code), + run_on_host=False, + ) - else: - runner_cmd = '/agent/runner.sh "' + model + '" /agent/current_task.txt' - run_results = await runtime.run_in_session(BashAction(command=runner_cmd, timeout=runner_timeout)) - - print(f"agent's run results: {run_results}") - print('Runner script finished.') - - result = { - 'task_id': task_id, - 'task': task, - 'project_path': project_path, - 'agent_run_results': run_results.output if hasattr(run_results, 'output') else str(run_results), - 'status': 'success' if (hasattr(run_results, 'exit_code') and 
run_results.exit_code == 0) else 'error', - 'run_on_host': False, - } - - container_id = container_id_early - if not container_id or container_id == "unknown": - try: - container_id_res = await runtime.run_in_session( - BashAction( - command='cat /etc/hostname 2>/dev/null || hostname 2>/dev/null || echo "unknown"', - timeout=10.0, - ) - ) - container_id = str(getattr(container_id_res, "output", "")).strip() - except Exception as e: - print(f"WARNING: Failed to get container id from inside container: {e}") - saved_image = None - container_stopped = False +async def _run_in_docker( # noqa: C901 + deployment, + project_path, + task_id, + task, + model, + agent_path, + _save_path: str, + timeout_ms: int, + *, + task_file_path: str | None = None, + interactive: bool = False, +) -> dict: + """Run task inside a Docker container. - if interactive and container_id and container_id != "unknown": - print("\n" + "=" * 60) - print("Interactive mode - Attaching to container. Type instructions (or 'quit'/'exit' to end).") - print("=" * 60 + "\n") - try: - proc = subprocess.run( - ["docker", "exec", "-it", container_id, "python3", "/agent/interactive_runner.py", model], - stdin=sys.stdin, - stdout=sys.stdout, - stderr=sys.stderr, - ) - if proc.returncode != 0: - print(f"Interactive session exited with code {proc.returncode}", file=sys.stderr) - except Exception as e: - print(f"WARNING: Interactive mode failed: {e}", file=sys.stderr) + _save_path: Unused in Docker path (results are returned to main.py which writes reports). + Kept for a consistent run_eval() → _run_in_docker() API. 
+ """ + if not SWEREX_AVAILABLE: + raise RuntimeError('swerex is not available.') - if container_id and container_id != "unknown": - print(f"Preparing to save Docker container {container_id} as an image and stop it...") - saved_image, container_stopped = _save_container_as_image(container_id, project_path, task_id) + _validate_agent_path(agent_path) + await deployment.start() + runtime = deployment.runtime - try: - await deployment.stop() - except Exception as e: - print(f"WARNING: Failed to stop deployment cleanly: {e}") + timeout_s = timeout_ms / 1000.0 + # swe-rex doesn't expose a public API for session-level timeout; + # override the internal config as a workaround. + if hasattr(runtime, '_config'): + runtime._config.timeout = timeout_s - result['container_id'] = container_id - result['saved_image'] = saved_image - result['container_stopped'] = container_stopped + await runtime.create_session(CreateBashSessionRequest()) - return result + print('Uploading project files...', flush=True) + await runtime.upload(UploadRequest(source_path=project_path, target_path='/repo')) + await _run_bash(runtime, 'cd /repo') + print('Uploading agent scripts...', flush=True) + await runtime.upload(UploadRequest(source_path=agent_path, target_path='/agent')) + await _run_bash( + runtime, + 'chmod +x /agent/runner.sh /agent/install.sh 2>/dev/null; /agent/install.sh', + 120.0, # install.sh may run pip install; allow up to 2 minutes + ) + + await _upload_task(runtime, task, task_file_path) + await _setup_container_env(runtime, timeout_ms) + + container_id = await _get_container_id(runtime) + result = None + + try: + # Prefer foreground interactive when container_id is available and stdin is a TTY. + if interactive and container_id and _stdin_is_tty(): + result = await _run_interactive_in_container(container_id, task_id, task, project_path, model, timeout_ms) + else: + if interactive and not _stdin_is_tty(): + print( + 'WARNING: Interactive mode requires a terminal (TTY). 
Running task in non-interactive mode.', + flush=True, + ) + elif interactive and not container_id: + print( + 'WARNING: Cannot get container ID; falling back to non-interactive mode.', + flush=True, + ) + # Background run: start runner, poll logs, then return result. + run_results = await _monitor_runner(runtime, model, timeout_s) + print(f'Runner result: {run_results}', flush=True) + result = _make_eval_result( + task_id, + task, + project_path, + run_results.output, + status_from_exit_code(run_results.exit_code), + run_on_host=False, + ) except Exception as e: - print(f"Task ended with error: {e}") - result = { - 'task_id': task_id, - 'task': task, - 'project_path': project_path, - 'agent_run_results': str(e), - 'status': 'error', - 'run_on_host': False, - 'container_id': None, - 'saved_image': None, - 'container_stopped': False, - } - if container_id_early and container_id_early != "unknown": - print("Attempting to save container as image (abnormal exit path)...") + logging.error('Task %s error: %s', task_id, e, exc_info=True) + result = _make_eval_result( + task_id, + task, + project_path, + str(e), + 'error', + run_on_host=False, + ) + finally: + if not container_id: + container_id = await _get_container_id(runtime) + + saved_image, stopped = None, False + if container_id: try: - saved_img, stopped = _save_container_as_image(container_id_early, project_path, task_id) - result['container_id'] = container_id_early - result['saved_image'] = saved_img - result['container_stopped'] = stopped - except Exception as save_e: - print(f"WARNING: Failed to save image on abnormal exit: {save_e}") + saved_image, stopped = _save_container(container_id, project_path, task_id) + except (OSError, subprocess.SubprocessError) as e: + logging.warning('Save container failed: %s', e) + try: await deployment.stop() - except Exception as stop_e: - print(f"WARNING: Failed to stop deployment cleanly: {stop_e}") - return result + except Exception as e: + # Container may already be 
stopped; deployment.close() can fail with + # ClientConnectorError when the remote service port is gone. + logging.warning('deployment.stop() failed for task %s: %s', task_id, e) + + if result is None: + # Exception occurred before any result was set (e.g. before try body ran + # or a BaseException was raised). Ensure we always have a dict for update/return. + result = _make_eval_result( + task_id, + task, + project_path, + 'Execution interrupted or failed before result was set.', + 'error', + run_on_host=False, + ) + result.update( + container_id=container_id, + saved_image=saved_image, + container_stopped=stopped, + ) + + return result + + +# --------------------------------------------------------------------------- +# Public entry point +# --------------------------------------------------------------------------- def run_eval( @@ -634,64 +884,60 @@ def run_eval( use_gpu: bool = False, task_file_path: str | None = None, interactive: bool = False, -): - """Run task in the given environment: local (host) or docker. - - Single entry point for one-task execution. Call this from main. - - Args: - env: 'local' = run on host; otherwise run in Docker (value can be image name). - project_path: Path to the artifact project - task_id: Task identifier - task: Task description (used when task_file_path is None) - model: Model name - agent_path: Path to agent scripts - save_path: Path to save results - docker_image: Docker image (used when env != 'local'); default if None. 
- timeout_ms: Optional total timeout in milliseconds for this task - skip_prereq_check: If True (host only), skip docker/API-key check before running - use_gpu: If True (Docker only), pass host GPU into container via --gpus all - task_file_path: If set, upload this file as task (avoids passing large string) - interactive: If True, after task completes user can continue giving agent instructions - """ - if timeout_ms is None: - timeout_ms = DEFAULT_TIMEOUT_MS - os.environ["BASH_MAX_TIMEOUT_MS"] = str(timeout_ms) - os.environ["BASH_DEFAULT_TIMEOUT_MS"] = str(timeout_ms) +) -> dict: + """Run task in the given environment (local host or Docker). - if str(env).strip().lower() == "local": - print(f"Task {task_id} configured to run on HOST (env=local, timeout_ms={timeout_ms}, interactive={interactive})") + Single entry point — called from main.py for each JSONL task. + """ + timeout_ms = resolve_timeout_ms(timeout_ms) + if is_local_env(env): + apply_timeout_env(timeout_ms) # Docker mode uses container env only; no host env. + print(f'Task {task_id}: HOST (timeout={timeout_ms}ms, interactive={interactive})') return asyncio.run( _run_local( - project_path, task_id, task, model, agent_path, save_path, timeout_ms, - skip_prereq_check=skip_prereq_check, interactive=interactive, + project_path, + task_id, + task, + model, + timeout_ms, + skip_prereq_check=skip_prereq_check, + interactive=interactive, ) ) if not SWEREX_AVAILABLE: - raise RuntimeError( - "SWE-ReX (swerex) is not available. Install swe-rex for Docker mode." - ) - image = docker_image or 'bastoica/ae-agent-ubuntu24.04:latest' + raise RuntimeError('SWE-ReX not available. 
Install swe-rex for Docker mode.') + + image = docker_image or DEFAULT_DOCKER_IMAGE docker_args = [ '--privileged', '--cgroupns=host', - '-e', 'KIND_EXPERIMENTAL_CONTAINERD_SNAPSHOTTER=native', + '-e', + 'KIND_EXPERIMENTAL_CONTAINERD_SNAPSHOTTER=native', ] if use_gpu: docker_args.extend(['--gpus', 'all']) + config = DockerDeploymentConfig( image=image, startup_timeout=1200.0, docker_args=docker_args, ) - deployment_obj = config.get_deployment() - gpu_note = " (GPU enabled)" if use_gpu else "" - interactive_note = " (interactive)" if interactive else "" - print(f"Task {task_id} configured to run in DOCKER (image={image}, timeout_ms={timeout_ms}){gpu_note}{interactive_note}") + deployment = config.get_deployment() + + gpu_note = ' (GPU)' if use_gpu else '' + print(f'Task {task_id}: DOCKER (image={image}, timeout={timeout_ms}ms){gpu_note}') return asyncio.run( - run_eval_in_env( - deployment_obj, project_path, task_id, task, model, agent_path, save_path, - task_file_path=task_file_path, interactive=interactive, + _run_in_docker( + deployment, + project_path, + task_id, + task, + model, + agent_path, + save_path, + timeout_ms, + task_file_path=task_file_path, + interactive=interactive, ) ) diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/runner.py b/benchmarks/arteval_bench/src/agents/ae_agent/runner.py index 44fcea3b..972573c2 100644 --- a/benchmarks/arteval_bench/src/agents/ae_agent/runner.py +++ b/benchmarks/arteval_bench/src/agents/ae_agent/runner.py @@ -1,248 +1,460 @@ #!/usr/bin/env python3 -"""AE Agent runner for ArtEvalBench - Claude Agent SDK for artifact tasks. +"""Core agent execution using Claude Agent SDK. -Runs inside benchmark container: artifact at /repo, agent at /agent; task as CLI arg or path to file. +Works both as a package module (imported by run_eval for host mode) and as a +standalone script (uploaded to Docker container and run via runner.sh). 
+ +Provides: +- build_system_prompt(): unified prompt builder for all environments +- run_agent(): single implementation of SDK invocation with rate-limit retry +- docker_main(): standalone Docker entry point """ +from __future__ import annotations + import asyncio +import logging import os +import re import sys -sys.path.insert(0, '/agent') +logger = logging.getLogger(__name__) +# Import utils: as package module or standalone in Docker. try: - from utils import DEFAULT_TIMEOUT_MS -except ImportError: - DEFAULT_TIMEOUT_MS = 172_800_000 # 48h fallback + from .utils import ( + DEFAULT_MODEL, + DEFAULT_TIMEOUT_MS, + has_api_key, + is_local_env, + resolve_timeout_ms, + ) +except (ImportError, SystemError): + sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + try: + from utils import ( + DEFAULT_MODEL, + DEFAULT_TIMEOUT_MS, + has_api_key, + is_local_env, + resolve_timeout_ms, + ) + except ImportError: + # Fallback when utils is not importable (e.g. container has only runner.py). + # Duplication intentional; single source is utils.py. Update both if default changes. 
+ DEFAULT_TIMEOUT_MS = 345_600_000 # 96h + DEFAULT_MODEL = 'claude-sonnet-4-5-20250929' + + def is_local_env(env: str) -> bool: # noqa: D103 + return str(env).strip().lower() == 'local' + + def has_api_key() -> bool: # noqa: D103 + return bool(os.environ.get('ANTHROPIC_API_KEY') or os.environ.get('ANTHROPIC_FOUNDRY_API_KEY')) + + def resolve_timeout_ms(timeout_ms: int | None) -> int: # noqa: D103 + return timeout_ms if timeout_ms is not None else DEFAULT_TIMEOUT_MS + try: - from claude_agent_sdk import query, ClaudeAgentOptions + from claude_agent_sdk import ClaudeAgentOptions, query + CLAUDE_SDK_AVAILABLE = True -except ImportError as e: - print(f"ERROR: Failed to import claude_agent_sdk: {e}", file=sys.stderr) +except ImportError: CLAUDE_SDK_AVAILABLE = False try: - from message_formatter import MessageFormatter - FORMATTER_AVAILABLE = True + from claude_agent_sdk import ClaudeSDKClient except ImportError: - print("WARNING: message_formatter not available, will use basic output.", file=sys.stderr) - FORMATTER_AVAILABLE = False + ClaudeSDKClient = None -if not CLAUDE_SDK_AVAILABLE: - print("ERROR: claude_agent_sdk is not available.", file=sys.stderr) - sys.exit(1) +_RATE_LIMIT_MAX_RETRIES = 5 +_RATE_LIMIT_WAIT_SEC = 60 +_RATE_LIMIT_WAIT_MAX_SEC = 600 +_RATE_LIMIT_WRAPPED_MAX_RETRIES = 3 +_PROGRESS_LOG_INTERVAL = 10 -RATE_LIMIT_MAX_RETRIES = 5 -RATE_LIMIT_WAIT_SEC = 60 -RATE_LIMIT_WAIT_MAX_SEC = 600 -RATE_LIMIT_WRAPPED_MAX_RETRIES = 3 + +_RESULT_TYPE_NAMES = frozenset({'ResultMessage', 'TextBlock'}) + + +def _process_message(message, message_count: int, result_text: str) -> tuple[int, str]: + """Process one SDK message: print, update count, extract result text. + + Returns (new_message_count, new_result_text). 
+ """ + message_count += 1 + if message_count % _PROGRESS_LOG_INTERVAL == 0: + print(f'[Progress] {message_count} messages...', flush=True) + msg_str = str(message) + print(msg_str, flush=True) + if type(message).__name__ in _RESULT_TYPE_NAMES: + result_text = msg_str + return message_count, result_text def _is_rate_limit_error(exc: BaseException) -> bool: msg = str(exc).lower() - return "429" in msg or "rate limit" in msg or "ratelimitreached" in msg + return '429' in msg or 'rate limit' in msg or 'ratelimitreached' in msg def _is_possible_wrapped_rate_limit(exc: BaseException) -> bool: - msg = str(exc) - return ("command failed" in msg.lower() and "exit code 1" in msg.lower()) or "check stderr" in msg.lower() + msg = str(exc).lower() + return ('command failed' in msg and 'exit code 1' in msg) or 'check stderr' in msg -def _parse_retry_after_seconds(exc: BaseException) -> int | None: - import re - m = re.search(r"wait\s+(\d+)\s*seconds", str(exc), re.I) +def _parse_retry_after(exc: BaseException) -> int | None: + m = re.search(r'wait\s+(\d+)\s*seconds', str(exc), re.I) return int(m.group(1)) if m else None -def get_default_work_dir(): - return os.environ.get('WORK_DIR', None) +# Shared prompt fragments (avoids duplication and keeps host/docker logic aligned). +_PROMPT_TIMEOUT_HOST = ( + 'TIMEOUT CONFIGURATION (CRITICAL):\n' + '- Long-running commands (builds, tests, Kind cluster creation) are expected\n' + '- DO NOT set short timeouts - let commands complete naturally\n\n' +) +_PROMPT_TIMEOUT_DOCKER = ( + 'TIMEOUT CONFIGURATION (CRITICAL):\n' + '- The system has been configured with a Bash timeout of {timeout_ms} ms.\n' + '- DO NOT specify timeout parameters in your Bash commands.\n' + '- Long-running commands can take hours - this is normal.\n' + '- If a command seems to be running long, DO NOT cancel or re-run it.\n\n' +) +_PROMPT_VERIFY_STEPS = ( + 'You MUST execute every verification step the README requires. 
Do NOT skip steps because they take a long time.\n' +) + + +def build_system_prompt( + task: str, + *, + env: str = 'docker', + artifact_path: str | None = None, + timeout_ms: int | None = None, +) -> str: + """Build system prompt, parameterized by execution environment. + + Args: + task: Task description text. + env: 'local' for host execution, anything else for Docker. + artifact_path: Path to artifact directory (used in host mode prompt). + timeout_ms: Bash timeout in ms (shown in Docker mode prompt). + """ + timeout_ms = resolve_timeout_ms(timeout_ms) + + if is_local_env(env): + path = artifact_path or '.' + return ( + 'You are an experienced software engineer completing an artifact task.\n\n' + 'ENVIRONMENT SETUP (HOST MACHINE):\n' + '- You are running DIRECTLY on the host machine (NOT inside a Docker container)\n' + '- Docker daemon is already running on this host\n' + '- You may need sudo for some operations\n\n' + f'ARTIFACT LOCATION:\n' + f'- The artifact repository is located at: {path}\n' + f'- Start by changing to this directory: cd {path}\n\n' + f'YOUR TASK:\n{task}\n\n' + _PROMPT_TIMEOUT_HOST + 'IMPORTANT GUIDELINES:\n' + f'1. First, cd to {path} and examine the directory structure\n' + '2. Follow the README instructions step by step\n' + f'3. {_PROMPT_VERIFY_STEPS}' + "4. If you see 'sudo' in instructions, you can use it (or skip if already root)\n" + '5. Use the Bash tool to run commands, Read tool to inspect files\n' + '6. Work systematically through setup, build, and experiment execution\n' + '7. If you encounter errors, debug and resolve them using available tools\n' + "8. For Kind clusters, they will work properly since you're on the host (not DinD)" + ) + # Docker/container: when running under arteval_bench, artifact is at /repo + path_hint = '' + if artifact_path: + path_hint = f'- The artifact repository is at: {artifact_path}. 
Change to it: cd {artifact_path}\n' + else: + path_hint = ( + '- The artifact repository should be in the current working directory or nearby.\n' + '- Explore the directory structure to find the artifact repository.\n' + ) -async def run_agent(model_name: str, task_description: str): - work_dir_hint = get_default_work_dir() - work_dir_instruction = f"- You may start by checking: {work_dir_hint}\n" if work_dir_hint else "" + return ( + 'You are an experienced software engineer.\n\n' + 'ENVIRONMENT SETUP:\n' + '- You are running inside a Docker container with root permissions.\n' + f'{path_hint}' + '- You have access to Read, Write, and Bash tools.\n\n' + f'YOUR TASK:\n{task}\n\n' + _PROMPT_TIMEOUT_DOCKER.format(timeout_ms=timeout_ms) + 'IMPORTANT GUIDELINES:\n' + '1. First, explore the current directory structure\n' + '2. Navigate to the artifact repository root directory\n' + "3. If you see 'sudo' in instructions, remove it (you already have root access)\n" + '4. Do NOT attempt to switch git branches\n' + '5. Follow the README instructions step by step\n' + f'6. {_PROMPT_VERIFY_STEPS}' + '7. Use the Bash, Read, and Write tools to complete the task\n' + '8. Work systematically through setup, build, and experiment execution\n' + '9. If you encounter errors, debug and resolve them' + ) - try: - timeout_ms_env = os.environ.get("BASH_MAX_TIMEOUT_MS") - timeout_ms = int(timeout_ms_env) if timeout_ms_env is not None else DEFAULT_TIMEOUT_MS - except ValueError: - timeout_ms = DEFAULT_TIMEOUT_MS - - base_prompt = f"""You are an experienced software engineer. - -ENVIRONMENT SETUP: -- You are running inside a Docker container with root permissions. -- The artifact repository should be in the current working directory or nearby. -- You should explore the directory structure to find the artifact repository. -{work_dir_instruction}- You have access to Read, Write, and Bash tools to complete the task. 
- -YOUR TASK: -{task_description} - -TIMEOUT CONFIGURATION (CRITICAL): -- The system has been configured with a default Bash timeout of {timeout_ms} ms (via BASH_MAX_TIMEOUT_MS). -- DO NOT specify timeout parameters in your Bash commands - the system default will be used automatically. -- Long-running commands (builds, tests, benchmarks) can take hours - this is normal and expected. -- If a command seems to be running long, DO NOT cancel or re-run it. Wait for completion. - -IMPORTANT GUIDELINES: -1. First, explore the current directory structure to understand where you are and where the artifact is located. -2. Navigate to the artifact repository root directory. -3. If you see 'sudo' in any instructions, remove it (you already have root access). -4. Do NOT attempt to switch git branches (you are already on the correct branch). -5. Follow the README instructions step by step. -6. You MUST execute every verification step, test, or command that the README (or referenced docs like TESTBED.md) says is required for evaluation or reproduction. Do NOT skip any such step just because the README mentions that it may take a long time. Long runtimes are expected; run each verification and wait for completion. -7. Use the Bash tool to run commands, Read tool to inspect files, and Write tool to create/modify files. -8. Work systematically through environment setup, build/install, benchmark preparation, and experiment execution. -9. If you encounter errors, try to debug and resolve them using the available tools. -10. For long-running commands, let them complete naturally. Do NOT set short timeouts or interrupt them.""" + +async def run_agent( # noqa: C901 + model_name: str, + task: str, + *, + system_prompt: str | None = None, + env: str = 'docker', + artifact_path: str | None = None, + timeout_ms: int | None = None, + interactive: bool = False, +) -> dict: + """Run the agent using Claude SDK. Single implementation for all modes. + + Args: + model_name: Claude model name (e.g. 
claude-sonnet-4-5-20250929) + task: Task description + system_prompt: If provided, use directly; otherwise built from env/artifact_path/task. + env: 'local' for host, else docker. Used to build prompt when system_prompt is None. + artifact_path: Artifact directory path (for prompt and initial message). + timeout_ms: Bash timeout in ms. + interactive: If True, enter interactive multi-turn loop after initial task. + + Returns: + dict with keys: exit_code (int), output (str), message_count (int) + """ + if not CLAUDE_SDK_AVAILABLE: + raise RuntimeError('claude_agent_sdk is not available. Install with: pip install claude-agent-sdk') + + timeout_ms = resolve_timeout_ms(timeout_ms) + if system_prompt is None: + system_prompt = build_system_prompt(task, env=env, artifact_path=artifact_path, timeout_ms=timeout_ms) options = ClaudeAgentOptions( - system_prompt=base_prompt, - allowed_tools=["Read", "Write", "Bash"], - setting_sources=["user"], + model=model_name, + system_prompt=system_prompt, + allowed_tools=['Read', 'Write', 'Bash'], + setting_sources=['user'], ) - formatter = None - if FORMATTER_AVAILABLE: - try: - formatter = MessageFormatter() - formatter.print_header() - except Exception as e: - print(f"WARNING: Failed to initialize MessageFormatter: {e}", file=sys.stderr) + initial_prompt = ( + f'Please start the artifact task. Begin by changing to the artifact ' + f'directory at {artifact_path} and examining its contents.' + if artifact_path + else 'Please start working on the artifact task. Begin by examining ' + 'the current directory and finding the artifact repository.' + ) - print(f"\n{'='*60}", flush=True) - print(f"Starting AE Agent (Claude SDK) with model: {model_name}", flush=True) - print(f"Task: {task_description[:200]}..." 
if len(task_description) > 200 else f"Task: {task_description}", flush=True) - print(f"{'='*60}\n", flush=True) + print(f'\n{"=" * 60}', flush=True) + print(f'Starting Claude Agent SDK with model: {model_name}', flush=True) + print(f'{"=" * 60}\n', flush=True) + + message_count = 0 + result_text = '' + + if interactive: + if ClaudeSDKClient is None: + raise RuntimeError('ClaudeSDKClient not available; cannot run interactive mode.') + async with ClaudeSDKClient(options=options) as client: + await client.query(initial_prompt) + async for message in client.receive_response(): + message_count, result_text = _process_message(message, message_count, result_text) + + print(f'\nInitial task done ({message_count} messages).', flush=True) + print('\n' + '=' * 60, flush=True) + print( + "Interactive mode — type instructions (or 'quit'/'exit' to end).", + flush=True, + ) + print('=' * 60 + '\n', flush=True) + while True: + try: + user_input = input('\n>>> ').strip() + except (EOFError, KeyboardInterrupt): + print('\nExiting interactive mode.', flush=True) + break + if not user_input: + continue + if user_input.lower() in ('quit', 'exit', 'q'): + print('Exiting interactive mode.', flush=True) + break + await client.query(user_input) + async for msg in client.receive_response(): + message_count, result_text = _process_message(msg, message_count, result_text) + + return { + 'exit_code': 0 if message_count > 0 else 1, + 'output': result_text, + 'message_count': message_count, + } + + # Non-interactive with rate-limit retry last_exception = None - for attempt in range(1, RATE_LIMIT_MAX_RETRIES + 1): + for attempt in range(1, _RATE_LIMIT_MAX_RETRIES + 1): try: - result_text = "" + result_text = '' message_count = 0 + async for message in query(prompt=initial_prompt, options=options): + message_count, result_text = _process_message(message, message_count, result_text) - async for message in query( - prompt="Please start working on the artifact task described in the system prompt. 
Begin by changing to the artifact repository directory and examining the README or instructions.", - options=options - ): - message_count += 1 - if message_count % 10 == 0: - print(f"[Progress] Processed {message_count} messages...", flush=True) - - if formatter: - try: - formatter.format_message(message) - except Exception as e: - print(f"WARNING: Failed to format message: {e}", file=sys.stderr, flush=True) - print(str(message), flush=True) - else: - print(str(message), flush=True) - - msg_str = str(message) - if 'ResultMessage' in msg_str or 'TextBlock' in msg_str: - result_text = msg_str - - if formatter: - formatter.print_footer() - - print(f"\n{'='*60}", flush=True) - print(f"AE Agent execution completed. Total messages: {message_count}", flush=True) - print(f"{'='*60}\n", flush=True) - - if formatter: - try: - metadata = formatter.get_api_metadata() - if metadata: - print(f"\nAPI Usage Metadata:", flush=True) - print(f" Input tokens: {metadata.get('input_tokens', 'N/A')}", flush=True) - print(f" Output tokens: {metadata.get('output_tokens', 'N/A')}", flush=True) - print(f" Total cost: ${metadata.get('total_cost', 'N/A')}", flush=True) - except Exception as e: - print(f"WARNING: Failed to get metadata: {e}", file=sys.stderr, flush=True) - - return 0 + print(f'Completed. 
Total messages: {message_count}', flush=True) + return { + 'exit_code': 0, + 'output': result_text, + 'message_count': message_count, + } except asyncio.TimeoutError as e: - print(f"\nERROR: AE Agent execution timed out: {e}", file=sys.stderr, flush=True) - if formatter: - formatter.print_footer() - return 1 + logger.error('Timed out: %s', e) + return { + 'exit_code': 1, + 'output': f'Timeout: {e}', + 'message_count': message_count, + } except Exception as e: last_exception = e - explicit_429 = _is_rate_limit_error(e) - wrapped_possible_429 = _is_possible_wrapped_rate_limit(e) and not explicit_429 - max_retries = RATE_LIMIT_MAX_RETRIES if explicit_429 else RATE_LIMIT_WRAPPED_MAX_RETRIES - is_retriable = (explicit_429 or wrapped_possible_429) and attempt < max_retries - if is_retriable: - parsed = _parse_retry_after_seconds(e) - wait_sec = min(parsed, RATE_LIMIT_WAIT_MAX_SEC) if parsed is not None else min( - RATE_LIMIT_WAIT_SEC * (2 ** (attempt - 1)), RATE_LIMIT_WAIT_MAX_SEC + explicit = _is_rate_limit_error(e) + wrapped = _is_possible_wrapped_rate_limit(e) and not explicit + max_r = _RATE_LIMIT_MAX_RETRIES if explicit else _RATE_LIMIT_WRAPPED_MAX_RETRIES + if (explicit or wrapped) and attempt < max_r: + parsed = _parse_retry_after(e) + wait = ( + min(parsed, _RATE_LIMIT_WAIT_MAX_SEC) + if parsed + else min( + _RATE_LIMIT_WAIT_SEC * (2 ** (attempt - 1)), + _RATE_LIMIT_WAIT_MAX_SEC, + ) ) - print( - f"\nRate limit or API error. Waiting {wait_sec}s before retry (attempt {attempt}/{max_retries})...", - file=sys.stderr, flush=True, + logger.warning( + 'Rate limit. 
Waiting %ds (attempt %d/%d)...', + wait, + attempt, + max_r, ) - await asyncio.sleep(wait_sec) + await asyncio.sleep(wait) continue - print(f"\nERROR: AE Agent execution failed: {e}", file=sys.stderr, flush=True) - import traceback - traceback.print_exc(file=sys.stderr) - sys.stderr.flush() - if formatter: - formatter.print_footer() - return 1 - - if last_exception: - print(f"\nERROR: AE Agent failed after {RATE_LIMIT_MAX_RETRIES} attempts: {last_exception}", file=sys.stderr, flush=True) - return 1 - - -def main(): - if len(sys.argv) != 3: - print("Usage: python3 runner.py ", file=sys.stderr) - print("Example: python3 runner.py claude-sonnet-4-5-20250929 /agent/current_task.txt", file=sys.stderr) + logger.error('%s', e, exc_info=True) + return { + 'exit_code': 1, + 'output': f'Error: {e}', + 'message_count': message_count, + } + + return { + 'exit_code': 1, + 'output': f'Failed after {_RATE_LIMIT_MAX_RETRIES} attempts: {last_exception}', + 'message_count': 0, + } + + +# --------------------------------------------------------------------------- +# Standalone entry point (Docker container via runner.sh) +# --------------------------------------------------------------------------- + + +def _ensure_api_key() -> None: + """Ensure at least one API key is set; exit with error otherwise.""" + if has_api_key(): + return + logger.error('API key not set. Set ANTHROPIC_API_KEY or ANTHROPIC_FOUNDRY_API_KEY.') + sys.exit(1) + + +_INTERACTIVE_SYSTEM_PROMPT = """\ +You are an experienced software engineer in an interactive session. + +ENVIRONMENT: +- You are inside a Docker container with root permissions. +- The artifact repository is at /repo. Change to it: cd /repo +- You have access to Read, Write, and Bash tools. + +TIMEOUT: Long-running commands can take hours; do not set short timeouts. + +You will receive follow-up instructions from the user. Complete each one and respond. 
+If the user asks to stop or says 'quit'/'exit', acknowledge and they will end the session.""" + +# When running under arteval_bench, artifact is always at /repo +_ARTIFACT_PATH_IN_CONTAINER = '/repo' + + +def docker_main(): + """Standalone entry point for running inside a Docker container via runner.sh.""" + raw_args = sys.argv[1:] + interactive = '--interactive' in raw_args + args = [a for a in raw_args if a != '--interactive'] + + # Mode 1 — interactive-only (no task): runner.py --interactive [model] + if interactive and len(args) <= 1: + model = args[0] if args else os.environ.get('AE_AGENT_MODEL', DEFAULT_MODEL) + _ensure_api_key() + result = asyncio.run( + run_agent( + model, + 'Please confirm you are in /repo and ready for follow-up instructions. Reply briefly.', + system_prompt=_INTERACTIVE_SYSTEM_PROMPT, + interactive=True, + ) + ) + sys.exit(result['exit_code']) + + # Mode 2 — task execution: runner.py [--interactive] + if len(args) != 2: + print( + 'Usage: python3 runner.py [--interactive]\n' + ' python3 runner.py --interactive [model]', + file=sys.stderr, + ) sys.exit(1) - model_name = sys.argv[1] - task_arg = sys.argv[2] + model_name = args[0] + task_arg = args[1] if os.path.isfile(task_arg): - with open(task_arg, 'r', encoding='utf-8') as f: - task_description = f.read() + with open(task_arg, encoding='utf-8') as f: + task = f.read() else: - task_description = task_arg + task = task_arg - if not os.environ.get('ANTHROPIC_API_KEY') and not os.environ.get('ANTHROPIC_FOUNDRY_API_KEY'): - print("ERROR: ANTHROPIC_API_KEY or ANTHROPIC_FOUNDRY_API_KEY must be set.", file=sys.stderr) - sys.exit(1) + _ensure_api_key() try: - timeout_ms_env = os.environ.get("BASH_MAX_TIMEOUT_MS") - timeout_ms = int(timeout_ms_env) if timeout_ms_env is not None else DEFAULT_TIMEOUT_MS + raw = os.environ.get('BASH_MAX_TIMEOUT_MS') + timeout_ms = int(raw) if raw else None except ValueError: - timeout_ms = DEFAULT_TIMEOUT_MS - timeout_s = timeout_ms / 1000.0 + timeout_ms = None + 
timeout_ms = resolve_timeout_ms(timeout_ms) + + # In container (arteval_bench): artifact is at /repo + artifact_path = _ARTIFACT_PATH_IN_CONTAINER if os.path.isdir(_ARTIFACT_PATH_IN_CONTAINER) else None try: - exit_code = asyncio.run( - asyncio.wait_for( - run_agent(model_name, task_description), - timeout=timeout_s, + if interactive: + result = asyncio.run( + run_agent( + model_name, + task, + env='docker', + artifact_path=artifact_path, + timeout_ms=timeout_ms, + interactive=True, + ) ) - ) + else: + result = asyncio.run( + asyncio.wait_for( + run_agent( + model_name, + task, + env='docker', + artifact_path=artifact_path, + timeout_ms=timeout_ms, + ), + timeout=timeout_ms / 1000.0, + ) + ) + sys.exit(result['exit_code']) except asyncio.TimeoutError: - print(f"ERROR: Agent execution exceeded timeout ({timeout_s} seconds).", file=sys.stderr, flush=True) + logger.error('Agent exceeded timeout.') sys.exit(1) except Exception as e: - print(f"ERROR: Failed to run agent: {e}", file=sys.stderr, flush=True) - import traceback - traceback.print_exc(file=sys.stderr) - sys.stderr.flush() + logger.error('%s', e, exc_info=True) sys.exit(1) - sys.exit(exit_code) - if __name__ == '__main__': - main() + docker_main() diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/runner.sh b/benchmarks/arteval_bench/src/agents/ae_agent/runner.sh index 090f8941..adf9bc69 100644 --- a/benchmarks/arteval_bench/src/agents/ae_agent/runner.sh +++ b/benchmarks/arteval_bench/src/agents/ae_agent/runner.sh @@ -1,23 +1,25 @@ #!/bin/bash -# AE Agent runner for ArtEvalBench. Invoked as: runner.sh -# Do not use set -e; some commands may return non-zero without indicating failure. +# Do not use set -e; some commands may return non-zero without indicating failure + +# Set the model and task as parameters (task can be text or path to file, e.g. 
/agent/current_task.txt)
 
 if [ $# -ne 2 ]; then
     echo "Usage: $0 <model_name> <task>"
-    echo "Example: $0 claude-sonnet-4-5-20250929 /agent/current_task.txt"
+    echo "Example: $0 claude-sonnet-4-5-20250929 \"Install and run tests\""
+    echo "         $0 claude-sonnet-4-5-20250929 /agent/current_task.txt"
     exit 1
 fi
 
-export ANTHROPIC_API_KEY="${ANTHROPIC_API_KEY}"
+# Disable Python buffering for real-time log output
 export PYTHONUNBUFFERED=1
 
-# 48h = 172800000 ms (align with benchmark long-running agent timeout)
+# Claude Agent SDK Bash timeout: use env if set, else default 96h (must match Python utils.DEFAULT_TIMEOUT_MS = 345_600_000)
 if [ -z "$BASH_MAX_TIMEOUT_MS" ]; then
-    export BASH_MAX_TIMEOUT_MS=172800000
+    export BASH_MAX_TIMEOUT_MS=345600000
 fi
 if [ -z "$BASH_DEFAULT_TIMEOUT_MS" ]; then
     export BASH_DEFAULT_TIMEOUT_MS="$BASH_MAX_TIMEOUT_MS"
 fi
 
-# Invoke Python runner (-u for unbuffered output). Second arg can be task text or path to file.
+# Invoke Python runner (-u for unbuffered output)
 python3 -u /agent/runner.py "$1" "$2"
diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/utils.py b/benchmarks/arteval_bench/src/agents/ae_agent/utils.py
index b419804e..6f5955ad 100644
--- a/benchmarks/arteval_bench/src/agents/ae_agent/utils.py
+++ b/benchmarks/arteval_bench/src/agents/ae_agent/utils.py
@@ -1,70 +1,190 @@
-"""Helper for AE Agent runner and host/Docker orchestration (main.py, run_eval.py)."""
+"""Helper methods for running artifact tasks."""
+
+from __future__ import annotations
 
 import json
 import os
+import re
 import subprocess
 
-# Default total timeout in milliseconds (48h); used by runner.py and run_eval.
-DEFAULT_TIMEOUT_MS = 172_800_000 - - -def interactive_from_item(item: dict) -> bool: - """Whether to enable interactive mode (user can continue giving agent instructions after task completes).""" - v = item.get("interactive", False) +__all__ = [ + 'AGENT_SUMMARY_FALLBACK_MAX', + 'DEFAULT_DOCKER_IMAGE', + 'DEFAULT_MODEL', + 'DEFAULT_TIMEOUT_MS', + 'LOG_OUTPUT_TRUNCATE_BYTES', + 'SUMMARY_BASENAME_TEMPLATE', + 'SUMMARY_INSTRUCTION', + 'Tee', + 'apply_timeout_env', + 'clone_artifact_repo', + 'compute_and_write_summary', + 'docker_image_from_item', + 'env_from_item', + 'get_task', + 'gpu_from_item', + 'has_api_key', + 'interactive_from_item', + 'is_local_env', + 'parse_eval_score', + 'read_task_from_file', + 'resolve_project_path', + 'resolve_timeout_ms', + 'safe_task_id', + 'status_from_exit_code', + 'timeout_env_dict', + 'timeout_ms_from_item', + 'write_task_report', +] + +# Default total timeout in milliseconds (96h); used by run_eval and runner. +# Single source: runner.py fallback and runner.sh (345600000) must match when utils is unavailable. +DEFAULT_TIMEOUT_MS = 345_600_000 + +# Default Docker image and model when not specified. +DEFAULT_DOCKER_IMAGE = 'bastoica/ae-agent-ubuntu24.04:latest' +DEFAULT_MODEL = 'claude-sonnet-4-5-20250929' + +# File naming templates for reports and summaries. +SUMMARY_BASENAME_TEMPLATE = 'ae_summary_{safe_id}.md' +SUMMARY_INSTRUCTION = ( + '\n\nAt the end, write a brief summary of what you did and the result to ' + '{basename} in the artifact root (so it can be included in the report).' 
+) +LOG_OUTPUT_TRUNCATE_BYTES = 50000 +AGENT_SUMMARY_FALLBACK_MAX = 8000 + + +def timeout_env_dict(timeout_ms: int) -> dict[str, str]: + """Return env vars dict for Bash timeout (single source for env and settings file).""" + return { + 'BASH_MAX_TIMEOUT_MS': str(timeout_ms), + 'BASH_DEFAULT_TIMEOUT_MS': str(timeout_ms), + } + + +def apply_timeout_env(timeout_ms: int) -> None: + """Set BASH_MAX_TIMEOUT_MS and BASH_DEFAULT_TIMEOUT_MS in os.environ.""" + os.environ.update(timeout_env_dict(timeout_ms)) + + +def resolve_timeout_ms(timeout_ms: int | None) -> int: + """Return timeout_ms if set, else DEFAULT_TIMEOUT_MS. Single place for default.""" + return timeout_ms if timeout_ms is not None else DEFAULT_TIMEOUT_MS + + +def has_api_key() -> bool: + """True if at least one of ANTHROPIC_API_KEY or ANTHROPIC_FOUNDRY_API_KEY is set.""" + return bool(os.environ.get('ANTHROPIC_API_KEY') or os.environ.get('ANTHROPIC_FOUNDRY_API_KEY')) + + +def status_from_exit_code(exit_code: int) -> str: + """Map process exit code to eval status string. Non-zero (incl. -1 for unknown) → 'error'.""" + return 'success' if exit_code == 0 else 'error' + + +def is_local_env(env: str) -> bool: + """True if env denotes local (host) execution rather than Docker.""" + return str(env).strip().lower() == 'local' + + +def _parse_bool_value(v, default: bool = False) -> bool: + """Parse a value (bool, str, or other) to bool. Strings 'true', '1', 'yes' → True.""" if isinstance(v, bool): return v if isinstance(v, str): - return v.strip().lower() in ("true", "1", "yes") + return v.strip().lower() in ('true', '1', 'yes') return bool(v) -def safe_task_id(task_id: str | None, fallback: str = "unknown") -> str: +# Default task template when artifact_readme is not specified. +_DEFAULT_TASK_TEMPLATE = ( + 'You are an experienced software engineer.' 
+ ' You are asked to navigate to the {file_path} and follow step-by-step' + ' instructions to set up, install, compile, and reproduce the results in' + ' that code repository. You have root access inside a Docker image, which' + ' means you can directly proceed with executing the steps in the README' + ' without asking for approval or confirmation. Once you reached the end' + ' of the README you must exit the Docker image gracefully.' +) + + +def interactive_from_item(item: dict) -> bool: + """Whether to enable interactive mode (user can continue giving agent instructions after task completes).""" + return _parse_bool_value(item.get('interactive', False)) + + +def safe_task_id(task_id: str | None, fallback: str = 'unknown') -> str: """Normalize task_id for use in filenames (no spaces, lowercase).""" - return (task_id or fallback).replace(" ", "_").lower() + return (task_id or fallback).replace(' ', '_').lower() def timeout_ms_from_item(item: dict) -> int | None: - """Parse timeout from task item. Returns ms (int) or None for default.""" - v = item.get("timeout", None) + """Parse timeout from task item. Returns ms (int) or None for default. + + Accepts either ``timeout_sec`` (seconds, preferred) or ``timeout_ms`` + (milliseconds). Falls back to the legacy ``timeout`` field, which is + treated as seconds if < 86_400 (24 hours), otherwise milliseconds. + """ + if 'timeout_sec' in item: + v = item['timeout_sec'] + if isinstance(v, (int, float)): + return int(v * 1000) + return None + if 'timeout_ms' in item: + v = item['timeout_ms'] + if isinstance(v, (int, float)): + return int(v) + return None + v = item.get('timeout', None) if v is None: return None if isinstance(v, (int, float)): - return int(v * 1000) if v < 1_000_000 else int(v) + # Legacy heuristic: 86400 = 24h in seconds; values below are treated as + # seconds, else as milliseconds (e.g. 345600000 = 96h). 
+ return int(v * 1000) if v < 86_400 else int(v) return None def env_from_item(item: dict) -> str: """Resolve env from task item: 'local' = host, else = docker. Backward compat: run_on_host/docker_env.""" - env = item.get("env", None) + env = item.get('env', None) if env is not None: s = str(env).strip().lower() - return "local" if s == "local" else (str(env).strip() or "docker") - return "local" if item.get("run_on_host", False) else "docker" + return 'local' if s == 'local' else (str(env).strip() or 'docker') + return 'local' if item.get('run_on_host', False) else 'docker' def gpu_from_item(item: dict) -> bool: """Whether to enable GPU access in Docker. Default False (no host GPU passed to container).""" - v = item.get("gpu", False) - if isinstance(v, bool): - return v - if isinstance(v, str): - return v.strip().lower() in ("true", "1", "yes") - return bool(v) + return _parse_bool_value(item.get('gpu', False)) def docker_image_from_item( item: dict, - default: str = "bastoica/ae-agent-ubuntu24.04:latest", + default: str | None = None, + *, + env: str | None = None, ) -> str | None: - """Resolve Docker image from task item. Returns None when env is local.""" - if env_from_item(item) == "local": + """Resolve Docker image from task item. + + When env is 'local', returns None (no Docker). Otherwise returns, in order: + item['env'] if it looks like an image name, item['docker_env'], or default. + If env is provided (e.g. from env_from_item), avoids parsing env twice. 
+ """ + resolved = (env if env is not None else env_from_item(item)).strip().lower() + if resolved == 'local': return None - env = item.get("env", None) - if env is not None: - s = str(env).strip() - if s and s.lower() != "local": + env_val = item.get('env', None) + if env_val is not None: + s = str(env_val).strip() + if s and s.lower() != 'local': return s - return item.get("docker_env", None) or item.get("docer_env", None) or default + return ( + item.get('docker_env', None) + or item.get('docer_env', None) + or (default or DEFAULT_DOCKER_IMAGE) + ) def get_task(file_path: str) -> str: @@ -76,16 +196,7 @@ def get_task(file_path: str) -> str: Returns: Task description string for the agent """ - task = ( - f"You are an experienced software engineer." - + f" You are asked to navigate to the {file_path} and follow step-by-step" - + f" instructions to set up, install, compile, and reproduce the results in" - + f" that code repository. You have root access inside a Docker image, which" - + f" means you can directly proceed with executing the steps in the README" - + f" without asking for approval or confirmation. Once you reached the end" - + f" of the README you must exit the Docker image gracefully." - ) - return task + return _DEFAULT_TASK_TEMPLATE.format(file_path=file_path) def read_task_from_file(artifact_path: str, task_file: str) -> str: @@ -100,83 +211,121 @@ def read_task_from_file(artifact_path: str, task_file: str) -> str: """ task_file_path = os.path.join(artifact_path, task_file) if os.path.exists(task_file_path): - with open(task_file_path, 'r', encoding='utf-8') as f: + with open(task_file_path, encoding='utf-8') as f: return f.read() else: return get_task(task_file) def clone_artifact_repo(artifact_url: str, target_dir: str) -> str: - """Clone artifact repository from URL into target_dir.""" + """Clone artifact repository from URL into target_dir. + + Args: + artifact_url: Git clone URL (e.g. 
https://github.com/org/repo.git) + target_dir: Absolute path to the directory to clone into (must not exist or be empty). + + Returns: + target_dir (artifact root path after clone). + + Raises: + RuntimeError: If git clone fails. + """ if os.path.exists(target_dir) and os.listdir(target_dir): return target_dir if os.path.exists(target_dir): os.rmdir(target_dir) r = subprocess.run( - ["git", "clone", "--depth", "1", artifact_url, target_dir], + ['git', 'clone', '--depth', '1', artifact_url, target_dir], capture_output=True, text=True, timeout=600, ) if r.returncode != 0: - raise RuntimeError(f"git clone failed: {r.stderr or r.stdout}") + raise RuntimeError(f'git clone failed: {r.stderr or r.stdout}') return target_dir def resolve_project_path(item: dict, input_file: str, save_path: str) -> tuple[str | None, str | None]: """Resolve artifact project path from task item. + When both artifact_url and artifact_dir are set, if the local path + (input_dir/artifact_dir) already exists, it is used and no clone is performed. + Otherwise the repo is cloned from artifact_url into save_path/workspace/. + Returns: (project_path, error_message). If error_message is not None, skip task. 
""" input_dir = os.path.dirname(os.path.abspath(input_file)) - artifact_dir = item.get("artifact_dir") - artifact_url = item.get("artifact_url") - task_id = item.get("artifact_id") + artifact_dir = item.get('artifact_dir') + artifact_url = item.get('artifact_url') + task_id = item.get('artifact_id') sid = safe_task_id(task_id) if artifact_url: candidate = os.path.join(input_dir, artifact_dir) if artifact_dir else None if candidate and os.path.isdir(candidate): return os.path.abspath(candidate), None - workspace_dir = os.path.join(save_path, "workspace", sid) + workspace_dir = os.path.join(save_path, 'workspace', sid) os.makedirs(os.path.dirname(workspace_dir), exist_ok=True) return clone_artifact_repo(artifact_url, workspace_dir), None if not artifact_dir: - return None, f"Skipping task {task_id}: missing artifact_dir and artifact_url" + return None, f'Skipping task {task_id}: missing artifact_dir and artifact_url' path = os.path.abspath(os.path.join(input_dir, artifact_dir)) if not os.path.isdir(path): - return None, f"Project path does not exist: {path}" + return None, f'Project path does not exist: {path}' return path, None class Tee: - """Write to both original stream and a log file.""" + """Write to both an original stream and a log file. + + Implements enough of the TextIO interface to serve as a drop-in + replacement for sys.stdout / sys.stderr (supports libraries that + probe encoding, isatty, etc.). 
+ """ def __init__(self, stream, log_path: str): + """Wrap stream and log_path for dual write.""" self._stream = stream self._path = log_path self._file = None def __enter__(self): - self._file = open(self._path, "a", encoding="utf-8") + """Open log file and return self.""" + self._file = open(self._path, 'a', encoding='utf-8') return self def __exit__(self, *args): + """Close log file.""" if self._file: self._file.close() def write(self, data): + """Write to both stream and log file.""" self._stream.write(data) if self._file: self._file.write(data) self._file.flush() def flush(self): + """Flush both stream and log file.""" self._stream.flush() if self._file: self._file.flush() + @property + def encoding(self) -> str: + """Return underlying stream encoding or utf-8.""" + return getattr(self._stream, 'encoding', 'utf-8') + + def isatty(self) -> bool: + """Return whether underlying stream is a TTY.""" + return getattr(self._stream, 'isatty', lambda: False)() + + def fileno(self) -> int: + """Return underlying stream fileno.""" + return self._stream.fileno() + def write_task_report( save_path: str, @@ -187,49 +336,74 @@ def write_task_report( agent_summary: str, ) -> None: """Write ae_report_.md for a single task.""" - report_path = os.path.join(save_path, f"ae_report_{safe_id}.md") - saved_image = result.get("saved_image") - with open(report_path, "w", encoding="utf-8") as fw: - fw.write(f"# AE Report: {task_id}\n\n") - fw.write(f"- **Status**: {result.get('status', 'unknown')}\n") - fw.write(f"- **Timestamp**: {result.get('timestamp', '')}\n") - fw.write(f"- **Project path**: {result.get('project_path', '')}\n") - fw.write(f"- **Run on host**: {result.get('run_on_host', False)}\n") - fw.write(f"- **Log file**: `{log_path}`\n\n") + report_path = os.path.join(save_path, f'ae_report_{safe_id}.md') + saved_image = result.get('saved_image') + with open(report_path, 'w', encoding='utf-8') as fw: + fw.write(f'# AE Report: {task_id}\n\n') + fw.write(f'- **Status**: 
{result.get("status", "unknown")}\n') + fw.write(f'- **Timestamp**: {result.get("timestamp", "")}\n') + fw.write(f'- **Project path**: {result.get("project_path", "")}\n') + fw.write(f'- **Run on host**: {result.get("run_on_host", False)}\n') + fw.write(f'- **Log file**: `{log_path}`\n\n') if saved_image: - fw.write("> [!Note]\n") - fw.write("> ## To check the result\n") - fw.write(">\n") - fw.write("> You can run the following command to manually check the result:\n") - fw.write(">\n") - fw.write("> ```bash\n") - fw.write(f"> docker run -it {saved_image} bash\n") - fw.write("> ```\n") - fw.write(">\n") - fw.write(f"> Image: `{saved_image}`\n\n") - fw.write("## Agent summary\n\n") + fw.write('> [!Note]\n') + fw.write('> ## To check the result\n') + fw.write('>\n') + fw.write('> You can run the following command to manually check the result:\n') + fw.write('>\n') + fw.write('> ```bash\n') + fw.write(f'> docker run -it {saved_image} bash\n') + fw.write('> ```\n') + fw.write('>\n') + fw.write(f'> Image: `{saved_image}`\n\n') + fw.write('## Agent summary\n\n') fw.write(agent_summary) - fw.write("\n") + fw.write('\n') + + +def parse_eval_score(output) -> int: + """Parse evaluation score from evaluator script output (string or object with .output). + + - If a line is a single digit (e.g. '4', '0'), use it (prefer last such line). + - If output contains 'Agent scores: {...}' (Oracle-style evaluator), count ': 1' as passed items. + - Otherwise return 0. + """ + s = (getattr(output, 'output', None) or str(output) or '').strip() + if not s: + return 0 + lines = s.splitlines() + for line in reversed(lines): + t = line.strip() + if t.isdigit(): + return int(t) + m = re.search(r'Agent scores:\s*\{[^}]*\}', s) + if m: + return m.group(0).count(': 1') + return 0 def compute_and_write_summary(save_path: str) -> tuple[int, int]: - """Read result.jsonl, compute total/success, write summary.json. 
Returns (total_count, success_count).""" - result_path = os.path.join(save_path, "result.jsonl") + """Read result.jsonl, compute total/success, write summary.json. + + total = number of result lines (success + error + skipped). success = status == "success". + Returns (total_count, success_count). + """ + result_path = os.path.join(save_path, 'result.jsonl') total, success = 0, 0 if os.path.isfile(result_path): - with open(result_path, encoding="utf-8") as f: + with open(result_path, encoding='utf-8') as f: for line in f: if not line.strip(): continue try: row = json.loads(line.strip()) total += 1 - if row.get("status") == "success": + if row.get('status') == 'success': success += 1 except json.JSONDecodeError: continue rate = success / total if total > 0 else 0.0 - summary = {"total_tasks": total, "successful_tasks": success, "success_rate": rate} - with open(os.path.join(save_path, "summary.json"), "w", encoding="utf-8") as f: + summary = {'total_tasks': total, 'successful_tasks': success, 'success_rate': rate} + with open(os.path.join(save_path, 'summary.json'), 'w', encoding='utf-8') as f: json.dump(summary, f, indent=4) return total, success diff --git a/benchmarks/arteval_bench/src/main.py b/benchmarks/arteval_bench/src/main.py index 43c7390a..134c680e 100644 --- a/benchmarks/arteval_bench/src/main.py +++ b/benchmarks/arteval_bench/src/main.py @@ -6,7 +6,10 @@ import sys from datetime import datetime -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../'))) +_src_dir = os.path.dirname(os.path.abspath(__file__)) +if _src_dir not in sys.path: + sys.path.insert(0, _src_dir) +sys.path.append(os.path.abspath(os.path.join(_src_dir, '../../../'))) from sdk.logger import logger from sdk.utils import set_llm_endpoint_from_config @@ -113,12 +116,13 @@ def main(file_path, model, agent, save_path): timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') save_path = os.path.join('./outputs', 
f'env_setup_project__{str_model_name}__{args.agent}__{timestamp}') + _src_dir = os.path.dirname(os.path.abspath(__file__)) if agent == 'claudecode': - agent = './src/agents/claudecode' + agent = os.path.join(_src_dir, 'agents', 'claudecode') elif agent == 'claude_sdk': - agent = './src/agents/claude_sdk' + agent = os.path.join(_src_dir, 'agents', 'claude_sdk') elif agent == 'ae_agent' or agent == 'ae-agent': - agent = './src/agents/ae_agent' + agent = os.path.join(_src_dir, 'agents', 'ae_agent') save_path = os.path.abspath(os.path.expanduser(save_path)) os.makedirs(save_path, exist_ok=True) diff --git a/benchmarks/arteval_bench/src/run_eval_in_env.py b/benchmarks/arteval_bench/src/run_eval_in_env.py index 190fadb0..748addfc 100644 --- a/benchmarks/arteval_bench/src/run_eval_in_env.py +++ b/benchmarks/arteval_bench/src/run_eval_in_env.py @@ -65,50 +65,73 @@ def setup_claude_settings_on_host(): logger.info(f"Created {settings_file} with 48-hour timeout configuration.") +def _is_ae_agent_path(agent_path) -> bool: + """True if agent_path points to the ae_agent agent (same flow: agent + evaluation script).""" + if not agent_path: + return False + p = (agent_path or "").rstrip(os.sep) + return p.endswith("ae_agent") or os.path.basename(p) == "ae_agent" + + async def run_eval_on_host(project_path, task_id, task, model, agent_path, test_method, save_path): """Run evaluation directly on host machine (no Docker container). - - This is useful for tasks that require Kind clusters or other Docker-in-Docker - scenarios that don't work well in nested containers. + + When agent is ae_agent, delegates to ae_agent.run_agent_then_eval (agent run + evaluation script), + same flow as claude_sdk. Otherwise uses inline Claude SDK + test_method. 
""" logger.info("=" * 80) logger.info("Running evaluation directly on HOST MACHINE (not in Docker)") logger.info("=" * 80) - - # Check prerequisites + + if _is_ae_agent_path(agent_path): + logger.info("Using ae_agent flow: run agent then evaluation script.") + try: + from agents.ae_agent.run_eval import _run_agent_then_eval_async + except ImportError: + _src = os.path.dirname(os.path.abspath(__file__)) + if _src not in sys.path: + sys.path.insert(0, _src) + from agents.ae_agent.run_eval import _run_agent_then_eval_async + result = await _run_agent_then_eval_async( + project_path=project_path, + task_id=task_id, + task=task, + model=model, + test_method=test_method, + save_path=save_path, + timeout_ms=None, + skip_prereq_check=False, + ) + return result + + # Original flow: inline Claude SDK then test_method (e.g. claude_sdk or default) import shutil - + if not shutil.which("docker"): raise RuntimeError("Docker is not installed on host") - - # Check if Docker is running + result = subprocess.run(["docker", "ps"], capture_output=True, timeout=10) if result.returncode != 0: raise RuntimeError("Docker is not running on host") - - # Check API key + if not os.environ.get("ANTHROPIC_API_KEY"): raise RuntimeError("ANTHROPIC_API_KEY environment variable is not set") - - # Setup Claude settings + setup_claude_settings_on_host() - - # Ensure project path is absolute + project_path = os.path.abspath(project_path) if not os.path.isdir(project_path): raise RuntimeError(f"Project path does not exist: {project_path}") - + logger.info(f"Project path: {project_path}") logger.info(f"Task ID: {task_id}") logger.info(f"Model: {model}") - - # Import Claude Agent SDK + try: from claude_agent_sdk import query, ClaudeAgentOptions except ImportError as e: raise RuntimeError(f"claude_agent_sdk not installed: {e}. 
Install with: pip install claude-agent-sdk") - - # Build system prompt for host execution + system_prompt = f"""You are an experienced software engineer completing an artifact evaluation task. ENVIRONMENT SETUP (HOST MACHINE - NOT DOCKER): @@ -143,70 +166,62 @@ async def run_eval_on_host(project_path, task_id, task, model, agent_path, test_ options = ClaudeAgentOptions( system_prompt=system_prompt, allowed_tools=["Read", "Write", "Bash"], - setting_sources=["user"], # Load ~/.claude/settings.json for timeout config + setting_sources=["user"], ) - - # Set environment variables + os.environ['BASH_MAX_TIMEOUT_MS'] = '172800000' os.environ['BASH_DEFAULT_TIMEOUT_MS'] = '172800000' - + logger.info("Starting Claude Agent SDK (Host Mode)...") - + message_count = 0 run_results_output = "" - + try: async for message in query( prompt=f"Please start the artifact evaluation task. Begin by changing to the artifact directory at {project_path} and examining its contents.", options=options ): message_count += 1 - if message_count % 10 == 0: logger.info(f"[Progress] Processed {message_count} messages...") - - # Log each message msg_str = str(message) logger.info(msg_str) - if 'ResultMessage' in msg_str or 'TextBlock' in msg_str: run_results_output = msg_str - logger.info(f"Claude Agent SDK execution completed. 
Total messages: {message_count}") - except Exception as e: logger.error(f"Claude Agent SDK execution failed: {e}") import traceback traceback.print_exc() run_results_output = f"Error: {e}" - - # Run evaluation (test_method) + logger.info("Running evaluation script...") try: - # Change to project directory and run test eval_cmd = f"cd {project_path} && {test_method}" eval_result = subprocess.run( eval_cmd, shell=True, capture_output=True, text=True, - timeout=300 # 5 minute timeout for evaluation + timeout=300 ) test_output = eval_result.stdout.strip() logger.info(f"Evaluation output: {test_output}") - result = { + 'task_id': task_id, 'task': task, 'project_path': project_path, 'agent_run_results': run_results_output, 'test_method': test_method, - 'score': int(test_output) if test_output.isdigit() else 0, + 'score': _parse_eval_score(test_output), 'status': 'success', 'run_on_host': True, } except Exception as e: logger.error(f"Error running test method: {e}") result = { + 'task_id': task_id, 'task': task, 'project_path': project_path, 'agent_run_results': run_results_output, @@ -215,7 +230,7 @@ async def run_eval_on_host(project_path, task_id, task, model, agent_path, test_ 'status': f'error: {str(e)}', 'run_on_host': True, } - + return result diff --git a/benchmarks/arteval_bench/src/utils.py b/benchmarks/arteval_bench/src/utils.py new file mode 100644 index 00000000..56bc657f --- /dev/null +++ b/benchmarks/arteval_bench/src/utils.py @@ -0,0 +1,4 @@ +"""Re-export get_task for main.py when run from benchmark root (python src/main.py).""" +from core.utils import get_task + +__all__ = ["get_task"] diff --git a/sdk/utils.py b/sdk/utils.py index cbd79357..995fdfaf 100644 --- a/sdk/utils.py +++ b/sdk/utils.py @@ -62,22 +62,37 @@ def set_llm_endpoint_from_config(config_path): logger.warning(' - %s', key) logger.warning('Only [evaluator_api_keys] values will be used for both evaluator and model under test.') - # First, set environment variables from [llm] + # Placeholder 
values that should not override an existing env var (e.g. from export) + _placeholders = frozenset({'', 'xxx', 'sk-xxxx', 'sk-xxx', 'xxx'}) + + def _is_placeholder(val): + if val is None: + return True + s = str(val).strip().lower() + return not s or s in _placeholders or s.startswith('sk-xxx') + + # First, set environment variables from [llm] (do not overwrite existing non-placeholder env) logger.info('Setting the following environment variables from [llm]:') for key, value in llm_config.items(): + if _is_placeholder(value) and os.environ.get(key) and not _is_placeholder(os.environ.get(key)): + logger.info('%s: (keeping existing env)', key) + continue logger.info('%s', f'{key}: [REDACTED]' if 'key' in key.lower() else f'{key}: {value}') - os.environ[key] = value + os.environ[key] = str(value) # add exception for SWE-Agent: if key == 'AZURE_API_KEY': - os.environ['AZURE_OPENAI_API_KEY'] = value + os.environ['AZURE_OPENAI_API_KEY'] = str(value) logger.info('AZURE_OPENAI_API_KEY: [REDACTED]') # Then, set environment variables from [evaluator_api_keys] (will override [llm] if conflict) logger.info('Setting the following environment variables from [evaluator_api_keys]:') for key, value in evaluator_config.items(): + if _is_placeholder(value) and os.environ.get(key) and not _is_placeholder(os.environ.get(key)): + logger.info('%s: (keeping existing env)', key) + continue logger.info('%s', f'{key}: [REDACTED]' if 'key' in key.lower() else f'{key}: {value}') - os.environ[key] = value + os.environ[key] = str(value) # add exception for SWE-Agent: if key == 'AZURE_API_KEY': - os.environ['AZURE_OPENAI_API_KEY'] = value + os.environ['AZURE_OPENAI_API_KEY'] = str(value) logger.info('AZURE_OPENAI_API_KEY: [REDACTED]') From cefd7c1033a1cada08bbab1959e7119f2878dc5f Mon Sep 17 00:00:00 2001 From: couen <2631403308@qq.com> Date: Fri, 27 Feb 2026 04:33:53 +0000 Subject: [PATCH 5/7] chore: unify task jsonl format - use env instead of docker_env/docer_env, omit optional fields 
Made-with: Cursor --- .../data/benchmark/ae_agent_smoke_test.jsonl | 4 ++-- .../data/benchmark/arteval_tasks.jsonl | 12 ++++++------ benchmarks/arteval_bench/src/main.py | 14 ++++++++++++-- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke_test.jsonl b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke_test.jsonl index 3971f37e..de29995c 100644 --- a/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke_test.jsonl +++ b/benchmarks/arteval_bench/data/benchmark/ae_agent_smoke_test.jsonl @@ -1,2 +1,2 @@ -{"artifact_id": "ae_agent_smoke_host", "artifact_dir": "ae_agent_smoke", "artifact_readme": "ae_agent_smoke/README.md", "evaluator": "python3 _agent_eval/check.py", "expected_score": 1, "run_on_host": true} -{"artifact_id": "ae_agent_smoke_docker", "artifact_dir": "ae_agent_smoke", "artifact_readme": "ae_agent_smoke/README.md", "docker_env": "bastoica/ae-agent-ubuntu24.04:latest", "evaluator": "python3 _agent_eval/check.py", "expected_score": 1, "run_on_host": false} +{"artifact_id": "ae_agent_smoke_host", "artifact_dir": "ae_agent_smoke", "artifact_readme": "ae_agent_smoke/README.md", "evaluator": "python3 _agent_eval/check.py", "expected_score": 1, "env": "local"} +{"artifact_id": "ae_agent_smoke_docker", "artifact_dir": "ae_agent_smoke", "artifact_readme": "ae_agent_smoke/README.md", "evaluator": "python3 _agent_eval/check.py", "expected_score": 1, "env": "bastoica/ae-agent-ubuntu24.04:latest", "timeout": 120000} diff --git a/benchmarks/arteval_bench/data/benchmark/arteval_tasks.jsonl b/benchmarks/arteval_bench/data/benchmark/arteval_tasks.jsonl index 6d02f195..8928ead5 100644 --- a/benchmarks/arteval_bench/data/benchmark/arteval_tasks.jsonl +++ b/benchmarks/arteval_bench/data/benchmark/arteval_tasks.jsonl @@ -1,6 +1,6 @@ -{"artifact_id": "sosp24_wasabi", "artifact_dir": "sosp24_wasabi", "artifact_readme": "sosp24_wasabi/wasabi/README.md", "artifact_url": 
"https://github.com/bastoica/wasabi/tree/sosp24-ae", "evaluator": "sosp24_wasabi/wasabi/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} -{"artifact_id": "osdi24_anvil", "artifact_dir": "osdi24_anvil", "artifact_readme": "osdi24_anvil/anvil/README.md", "artifact_url": "https://github.com/anvil-verifier/anvil", "evaluator": "osdi24_anvil/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} -{"artifact_id": "sosp23_acto", "artifact_dir": "sosp23_acto", "artifact_readme": "sosp23_acto/acto/README.md", "artifact_url": "https://github.com/xlab-uiuc/acto", "evaluator": "sosp23_acto/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} -{"artifact_id": "eurosys25_egwalker", "artifact_dir": "eurosys25_egwalker", "artifact_readme": "eurosys25_egwalker/egwalker/README.md", "artifact_url": "https://github.com/josephg/egwalker-paper", "evaluator": "eurosys25_egwalker/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} -{"artifact_id": "eurosys25_depsurf", "artifact_dir": "eurosys25_depsurf", "artifact_readme": "eurosys25_depsurf/depsurf/README.md", "artifact_url": "https://github.com/ShawnZhong/DepSurf", "evaluator": "eurosys25_depsurf/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} -{"artifact_id": "osdi24_eet", "artifact_dir": "osdi24_eet", "artifact_readme": "osdi24_eet/eet/README.md", "artifact_url": "https://github.com/JZuming/EET", "evaluator": "osdi24_eet/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"} +{"artifact_id": "sosp24_wasabi", "artifact_dir": "sosp24_wasabi", "artifact_readme": "sosp24_wasabi/wasabi/README.md", "artifact_url": "https://github.com/bastoica/wasabi/tree/sosp24-ae", "evaluator": "sosp24_wasabi/wasabi/_agent_eval/main.py", "expected_score": 4, "env": 
"bastoica/ae-agent-ubuntu24.04:latest"} +{"artifact_id": "osdi24_anvil", "artifact_dir": "osdi24_anvil", "artifact_readme": "osdi24_anvil/anvil/README.md", "artifact_url": "https://github.com/anvil-verifier/anvil", "evaluator": "osdi24_anvil/_agent_eval/main.py", "expected_score": 4, "env": "bastoica/ae-agent-ubuntu24.04:latest"} +{"artifact_id": "sosp23_acto", "artifact_dir": "sosp23_acto", "artifact_readme": "sosp23_acto/acto/README.md", "artifact_url": "https://github.com/xlab-uiuc/acto", "evaluator": "sosp23_acto/_agent_eval/main.py", "expected_score": 4, "env": "bastoica/ae-agent-ubuntu24.04:latest"} +{"artifact_id": "eurosys25_egwalker", "artifact_dir": "eurosys25_egwalker", "artifact_readme": "eurosys25_egwalker/egwalker/README.md", "artifact_url": "https://github.com/josephg/egwalker-paper", "evaluator": "eurosys25_egwalker/_agent_eval/main.py", "expected_score": 4, "env": "bastoica/ae-agent-ubuntu24.04:latest"} +{"artifact_id": "eurosys25_depsurf", "artifact_dir": "eurosys25_depsurf", "artifact_readme": "eurosys25_depsurf/depsurf/README.md", "artifact_url": "https://github.com/ShawnZhong/DepSurf", "evaluator": "eurosys25_depsurf/_agent_eval/main.py", "expected_score": 4, "env": "bastoica/ae-agent-ubuntu24.04:latest"} +{"artifact_id": "osdi24_eet", "artifact_dir": "osdi24_eet", "artifact_readme": "osdi24_eet/eet/README.md", "artifact_url": "https://github.com/JZuming/EET", "evaluator": "osdi24_eet/_agent_eval/main.py", "expected_score": 4, "env": "bastoica/ae-agent-ubuntu24.04:latest"} diff --git a/benchmarks/arteval_bench/src/main.py b/benchmarks/arteval_bench/src/main.py index 134c680e..d3702027 100644 --- a/benchmarks/arteval_bench/src/main.py +++ b/benchmarks/arteval_bench/src/main.py @@ -33,12 +33,22 @@ def main(file_path, model, agent, save_path): logger.info(f'Skipping invalid JSON line: {line}') continue - deployment = item.get('docker_env', None) or item.get('docer_env', None) + env_val = item.get('env', None) + if env_val is not None: + s = 
str(env_val).strip().lower() + if s == 'local': + run_on_host = True + deployment = None + else: + run_on_host = False + deployment = str(env_val).strip() or None + else: + deployment = item.get('docker_env', None) or item.get('docer_env', None) + run_on_host = item.get('run_on_host', False) project_path = f"./data/benchmark/{item.get('artifact_dir', None)}" task_file = item.get('artifact_readme', None) task_id = item.get('artifact_id', None) test_method = item.get('evaluator', None) - run_on_host = item.get('run_on_host', False) task = get_task(task_file) From 511bfa3436548fda347b93aa961b9027746fc79d Mon Sep 17 00:00:00 2001 From: couen <2631403308@qq.com> Date: Sun, 1 Mar 2026 14:02:28 +0000 Subject: [PATCH 6/7] Refactor arteval bench: ae_agent integration and code cleanup - main.py: Extract _is_ae_agent(agent) helper and use it for report/summary writing; use json.dumps(..., ensure_ascii=False) for result.jsonl - run_eval_in_env.py: Remove unused Path import in interactive foreground path; reuse _get_container_id_from_runtime for long-running agent block instead of duplicating container ID resolution - README: Update usage and JSONL/CLI options --- benchmarks/arteval_bench/README.md | 62 ++ benchmarks/arteval_bench/env.toml | 2 +- .../src/agents/ae_agent/README.md | 8 +- .../arteval_bench/src/agents/ae_agent/main.py | 30 +- .../src/agents/ae_agent/run_eval.py | 88 ++- .../src/agents/ae_agent/runner.py | 32 +- .../src/agents/ae_agent/utils.py | 48 +- benchmarks/arteval_bench/src/main.py | 121 +++- .../arteval_bench/src/run_eval_in_env.py | 540 ++++++++++++------ 9 files changed, 718 insertions(+), 213 deletions(-) diff --git a/benchmarks/arteval_bench/README.md b/benchmarks/arteval_bench/README.md index bfc30f77..84cc1b78 100644 --- a/benchmarks/arteval_bench/README.md +++ b/benchmarks/arteval_bench/README.md @@ -182,5 +182,67 @@ The benchmark supports multiple AI agents: - **Claude Code**: Anthropic's code assistant - **Mini SWE Agent**: The compact version of 
[SWE-agent](https://github.com/SWE-agent) assistant
 - **OpenHands**: Open-source coding agent
+- **ae_agent**: Claude Agent SDK–based agent (same logic as the standalone [artifact-agent](https://github.com/sys-intelligence/artifact-agent) repo), with full support for host/Docker, interactive mode, Skill, Sub-agent, per-task timeout, GPU, and optional container sync/commit/stop.
 
 To add your own agent to the benchmark, see [add_agents.md](add_agents.md).
+
+#### » ae_agent usage and options
+
+When using the **ae_agent** (`-a ae_agent` or `-a ae-agent`), you can pass the following from the command line and/or the task JSONL.
+
+**Command-line arguments**
+
+| Argument | Description |
+|----------|-------------|
+| `-i`, `--input_file` | Input JSONL file with tasks (default: `./data/benchmark/arteval_tasks.jsonl`). |
+| `-o`, `--save_path` | Directory for results (default: `./outputs/ae_<model>__ae-agent_<timestamp>`). |
+| `-a`, `--agent` | Agent name; use `ae_agent` or `ae-agent` for this agent. |
+| `-m`, `--model_name` | Model name (e.g. `claude-sonnet-4-5-20250929`). |
+| `--interactive` | After the task completes, keep a session open so you can give more instructions (requires a TTY). In Docker mode the runner is executed in the foreground via `docker exec -it`. |
+| `--enable-skill` | Enable Claude Agent SDK Skill (load from `~/.claude/skills/` and `.claude/skills/`). |
+| `--enable-subagent` | Enable Claude Agent SDK Sub-agent (Task tool). |
+
+**JSONL task fields (per line)**
+
+| Field | Description |
+|-------|-------------|
+| `artifact_id` | Unique task identifier. |
+| `artifact_dir` | Artifact directory name (relative to the JSONL file’s directory). |
+| `artifact_readme` | Path to the README or task description file (relative to artifact root). |
+| `artifact_url` | Optional. Git clone URL; used when `artifact_dir` is missing or the path does not exist. |
+| `env` | `"local"` for host; Docker image name (e.g. `bastoica/ae-agent-ubuntu24.04:latest`) for Docker. 
| 
+| `evaluator` | Command to run after the agent (e.g. `python _agent_eval/main.py`). |
+| `expected_score` | Expected score for this artifact (default 4). |
+| `timeout` | Optional. Per-task timeout in seconds or milliseconds (see utils: values < 86400 are seconds, else milliseconds). |
+| `gpu` | Optional. When `true`, pass `--gpus all` to Docker (Docker mode only). |
+| `interactive` | Optional. When `true`, enable interactive mode for this task (overrides CLI default). |
+| `enable_skill` | Optional. When `true`, enable Skill for this task. |
+| `enable_subagent` | Optional. When `true`, enable Sub-agent for this task. |
+| `keep_container` | Optional. When `false` (default for ae_agent), after the run the workspace is synced from the container to the host, the container is committed as an image, and the container is stopped. When `true`, the container is left running for inspection. |
+
+**Examples**
+
+```sh
+# Host mode, default options
+python src/main.py -i ./data/benchmark/arteval_tasks.jsonl -a ae_agent -o ./outputs/run1
+
+# With interactive mode (TTY required for Docker)
+python src/main.py --interactive -i ./data/benchmark/arteval_tasks.jsonl -a ae_agent -o ./outputs/run2
+
+# Enable Skill and Sub-agent
+python src/main.py --enable-skill --enable-subagent -i ./data/benchmark/arteval_tasks.jsonl -a ae_agent -o ./outputs/run3
+```
+
+**Outputs (when using ae_agent)**
+
+Results are written under the given `save_path`:
+
+- `result.jsonl` — One JSON object per task (task_id, status, score, agent_run_results, etc.).
+- `avg_score.json` — Benchmark summary (final_score, total_tasks).
+- `ae_report_<task_id>.md` — Per-task report (status, project path, log file, agent summary, and optional Docker image instructions).
+- `summary.json` — Total and successful task counts and success rate (same format as standalone artifact-agent).
+- When running via the benchmark entry, log paths and agent summary are filled from available data; standalone `python -m ae_agent.main` also produces `ae_log_<task_id>.log`.
+
+**Docker + interactive**
+
+For Docker tasks with `interactive: true` (or `--interactive`), the benchmark runs the agent in the foreground via `docker exec -it` so you can interact in the same terminal. This requires a real TTY (e.g. running `python src/main.py ...` in a terminal, not under CI or with redirected stdin). If stdin is not a TTY, the run falls back to non-interactive (background runner) and a warning is logged.
diff --git a/benchmarks/arteval_bench/env.toml b/benchmarks/arteval_bench/env.toml
index eac33edd..564e06ab 100644
--- a/benchmarks/arteval_bench/env.toml
+++ b/benchmarks/arteval_bench/env.toml
@@ -2,7 +2,7 @@
 AZURE_API_KEY = "XXX"
 AZURE_API_BASE = "XXXX"
 AZURE_API_VERSION = "XXX"
-ANTHROPIC_API_KEY = "sk-XXXX"
+ANTHROPIC_API_KEY = "YOUR_ANTHROPIC_API_KEY"
 
 [hardware]
 use_gpu = false
diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/README.md b/benchmarks/arteval_bench/src/agents/ae_agent/README.md
index bf0e3aa9..f630da1d 100644
--- a/benchmarks/arteval_bench/src/agents/ae_agent/README.md
+++ b/benchmarks/arteval_bench/src/agents/ae_agent/README.md
@@ -30,16 +30,16 @@ The benchmark will:
 4. **Evaluation script flow** (same as claude_sdk): after the agent finishes, run the JSONL `evaluator` (test_method), e.g. `cd /repo && python _agent_eval/main.py`, parse output for `score` and write to result.
 5. If set, pass through `ANTHROPIC_API_KEY`, `ANTHROPIC_FOUNDRY_API_KEY`, `ANTHROPIC_FOUNDRY_BASE_URL`, `CLAUDE_CODE_USE_FOUNDRY`.
 
-**Evaluation flow on host**: When `run_on_host=True` and the agent is ae_agent, `run_eval_in_env.run_eval_on_host` calls this package’s `run_agent_then_eval()`: run the agent first, then run `test_method` on the host (e.g. 
`cd project_path && python _agent_eval/main.py`), parse score with `utils.parse_eval_score()`, and return a result with the same shape as the Docker path (`score`, `test_method`, `status`). +**Evaluation flow on host**: When `run_on_host=True` and the agent is ae_agent, `run_eval_in_env.run_eval_on_host` calls this package's `run_agent_then_eval()`: run the agent first, then run `test_method` on the host (e.g. `cd project_path && python _agent_eval/main.py`), parse score with `utils.parse_eval_score()`, and return a result with the same shape as the Docker path (`score`, `test_method`, `status`). ## Dependencies - Python 3; `claude-agent-sdk` is installed in the container via `install.sh`. -- When running in Docker via the benchmark’s `run_eval_in_env.py`, install `swerex` on the host (the benchmark includes it). When using this directory’s `main.py` for Docker mode standalone, you also need `swe-rex`. +- When running in Docker via the benchmark's `run_eval_in_env.py`, install `swerex` on the host (the benchmark includes it). When using this directory's `main.py` for Docker mode standalone, you also need `swe-rex`. ## Running on host (local) -You can run tasks on the **host** from this directory (without the benchmark’s Docker flow): +You can run tasks on the **host** from this directory (without the benchmark's Docker flow): 1. **Single or batch via main.py** Use a JSONL where each line can set `"env": "local"` or `"run_on_host": true` to run that task on the host; others run in Docker (requires swerex). @@ -59,4 +59,4 @@ You can run tasks on the **host** from this directory (without the benchmark’s ## Relation to the standalone ae-agent repo -The standalone ae-agent repo provides the same host/Docker CLI. This sub-agent includes both the **in-container** runner (used by the benchmark’s `run_eval_in_env.py`) and **host/local** mode via `main.py` and `run_eval.py`. +The standalone ae-agent repo provides the same host/Docker CLI. 
This sub-agent includes both the **in-container** runner (used by the benchmark's `run_eval_in_env.py`) and **host/local** mode via `main.py` and `run_eval.py`. diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/main.py b/benchmarks/arteval_bench/src/agents/ae_agent/main.py index d88cdc27..ac39a3a7 100644 --- a/benchmarks/arteval_bench/src/agents/ae_agent/main.py +++ b/benchmarks/arteval_bench/src/agents/ae_agent/main.py @@ -25,6 +25,8 @@ Tee, compute_and_write_summary, docker_image_from_item, + enable_skill_from_item, + enable_subagent_from_item, env_from_item, get_task, gpu_from_item, @@ -89,12 +91,16 @@ def _run_single_task( save_path: str, input_file: str, interactive_default: bool, + enable_skill_default: bool = False, + enable_subagent_default: bool = False, ) -> None: """Process a single JSONL task: parse, run, write results and report.""" env = env_from_item(item) docker_image = docker_image_from_item(item, env=env) use_gpu = gpu_from_item(item) interactive = interactive_from_item(item) or interactive_default + enable_skill = enable_skill_from_item(item, enable_skill_default) + enable_subagent = enable_subagent_from_item(item, enable_subagent_default) task_file = item.get('artifact_readme', None) task_id = item.get('artifact_id', None) timeout_ms = timeout_ms_from_item(item) @@ -115,7 +121,7 @@ def _run_single_task( f.write(task) timeout_str = str(timeout_ms) if timeout_ms is not None else 'default' - print(f'Task {task_id}: env={env}, timeout_ms={timeout_str}, gpu={use_gpu}, interactive={interactive}') + print(f'Task {task_id}: env={env}, timeout_ms={timeout_str}, gpu={use_gpu}, interactive={interactive}, enable_skill={enable_skill}, enable_subagent={enable_subagent}') log_path = os.path.join(save_path, f'ae_log_{safe_id}.log') with open(log_path, 'w', encoding='utf-8') as lf: @@ -143,6 +149,8 @@ def _run_single_task( timeout_ms=timeout_ms, use_gpu=use_gpu, interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, ) 
except Exception as e: sys.stdout, sys.stderr = old_stdout, old_stderr @@ -160,7 +168,7 @@ def _run_single_task( print(f'Task {task_id} completed. Status: {result.get("status", "unknown")}') -def main(input_file, model, agent, save_path, interactive_default: bool = False): +def main(input_file, model, agent, save_path, interactive_default: bool = False, enable_skill_default: bool = False, enable_subagent_default: bool = False): """Main function for running tasks.""" if not os.path.isfile(input_file): logging.error('Input file not found: %s', input_file) @@ -186,6 +194,8 @@ def main(input_file, model, agent, save_path, interactive_default: bool = False) save_path=save_path, input_file=input_file, interactive_default=interactive_default, + enable_skill_default=enable_skill_default, + enable_subagent_default=enable_subagent_default, ) total_count, success_count = compute_and_write_summary(save_path) @@ -201,6 +211,8 @@ class _ResolvedConfig: agent: str save_path: str interactive_default: bool + enable_skill_default: bool + enable_subagent_default: bool def _parse_args() -> argparse.Namespace: @@ -230,6 +242,16 @@ def _parse_args() -> argparse.Namespace: action='store_true', help='Enable interactive mode (continue giving agent instructions after task completes)', ) + parser.add_argument( + '--enable-skill', + action='store_true', + help='Enable Claude Agent SDK Skill (load from ~/.claude/skills/ and .claude/skills/)', + ) + parser.add_argument( + '--enable-subagent', + action='store_true', + help='Enable Claude Agent SDK Sub-agent (Task tool)', + ) return parser.parse_args() @@ -258,6 +280,8 @@ def _resolve_paths(args: argparse.Namespace) -> _ResolvedConfig: agent=agent, save_path=save_path, interactive_default=getattr(args, 'interactive', False), + enable_skill_default=getattr(args, 'enable_skill', False), + enable_subagent_default=getattr(args, 'enable_subagent', False), ) @@ -274,6 +298,8 @@ def cli_main(): config.agent, config.save_path, 
interactive_default=config.interactive_default, + enable_skill_default=config.enable_skill_default, + enable_subagent_default=config.enable_subagent_default, ) diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py b/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py index dd14ce25..aaba73cc 100644 --- a/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py +++ b/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py @@ -172,6 +172,8 @@ async def _run_local( *, skip_prereq_check: bool = False, interactive: bool = False, + enable_skill: bool = False, + enable_subagent: bool = False, ): """Run one task on host by delegating to runner.run_agent().""" print('=' * 80) @@ -199,6 +201,8 @@ async def _run_local( artifact_path=project_path, timeout_ms=timeout_ms, interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, ) return _make_eval_result( @@ -230,6 +234,9 @@ async def _run_agent_then_eval_async( timeout_ms: int | None = None, *, skip_prereq_check: bool = False, + interactive: bool = False, + enable_skill: bool = False, + enable_subagent: bool = False, ) -> dict: """Run agent on host, then run evaluation script (test_method); return result with score. @@ -253,7 +260,9 @@ async def _run_agent_then_eval_async( env='local', artifact_path=project_path, timeout_ms=timeout_ms, - interactive=False, + interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, ) agent_output = agent_result['output'] agent_status = status_from_exit_code(agent_result['exit_code']) @@ -307,6 +316,9 @@ def run_agent_then_eval( timeout_ms: int | None = None, *, skip_prereq_check: bool = False, + interactive: bool = False, + enable_skill: bool = False, + enable_subagent: bool = False, ) -> dict: """Synchronous entry: run agent on host then evaluation script; return result with score. 
@@ -322,6 +334,9 @@ def run_agent_then_eval( save_path, timeout_ms=timeout_ms, skip_prereq_check=skip_prereq_check, + interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, ) ) @@ -475,6 +490,15 @@ def _save_container( return image_tag, stopped +def save_container_after_run(container_id: str, project_path: str, task_id: str) -> tuple[str | None, bool]: + """Sync workspace from container to host, commit as image, stop container. + + Public entry for run_eval_in_env when keep_container=False (original artifact-agent behavior). + Returns (saved_image_tag, container_stopped). + """ + return _save_container(container_id, project_path, task_id) + + async def _get_container_id(runtime) -> str | None: """Get container hostname/ID from inside the container.""" try: @@ -496,8 +520,13 @@ def _shell_escape(s: str) -> str: return s.replace("'", "'\"'\"'") -def _build_api_env_dict(timeout_ms: int) -> dict[str, str]: - """Build env vars dict for API keys, Foundry, and timeouts. +def _build_api_env_dict( + timeout_ms: int, + *, + enable_skill: bool = False, + enable_subagent: bool = False, +) -> dict[str, str]: + """Build env vars dict for API keys, Foundry, timeouts, and SDK options. Single source of truth for _docker_exec_env_args and _setup_container_env. 
""" @@ -515,12 +544,25 @@ def _build_api_env_dict(timeout_ms: int) -> dict[str, str]: env['ANTHROPIC_FOUNDRY_BASE_URL'] = foundry_url if os.environ.get('CLAUDE_CODE_USE_FOUNDRY') == '1': env['CLAUDE_CODE_USE_FOUNDRY'] = '1' + if enable_skill: + env['AE_ENABLE_SKILL'] = '1' + if enable_subagent: + env['AE_ENABLE_SUBAGENT'] = '1' return env -def _docker_exec_env_args(timeout_ms: int) -> list[str]: +def _docker_exec_env_args( + timeout_ms: int, + *, + enable_skill: bool = False, + enable_subagent: bool = False, +) -> list[str]: """Build -e VAR=value args for docker exec (env vars needed by runner.py).""" - env = _build_api_env_dict(timeout_ms) + env = _build_api_env_dict( + timeout_ms, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) args = [] for k, v in env.items(): args.extend(['-e', f'{k}={v}']) @@ -547,9 +589,15 @@ async def _upload_task(runtime, task: str, task_file_path: str | None): shutil.rmtree(tmpdir, ignore_errors=True) -async def _setup_container_env(runtime, timeout_ms: int): +async def _setup_container_env( + runtime, timeout_ms: int, *, enable_skill: bool = False, enable_subagent: bool = False +): """Set timeout and API keys inside the container.""" - env = _build_api_env_dict(timeout_ms) + env = _build_api_env_dict( + timeout_ms, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) parts = [f"export {k}='{_shell_escape(v)}'" for k, v in env.items()] await _run_bash(runtime, ' && '.join(parts)) @@ -690,6 +738,9 @@ async def _run_interactive_in_container( project_path: str, model: str, timeout_ms: int, + *, + enable_skill: bool = False, + enable_subagent: bool = False, ) -> dict: """Run task + interactive in foreground via docker exec -it. 
@@ -707,7 +758,11 @@ async def _run_interactive_in_container( 'docker', 'exec', '-it', - *_docker_exec_env_args(timeout_ms), + *_docker_exec_env_args( + timeout_ms, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ), container_id, 'python3', '-u', @@ -751,6 +806,8 @@ async def _run_in_docker( # noqa: C901 *, task_file_path: str | None = None, interactive: bool = False, + enable_skill: bool = False, + enable_subagent: bool = False, ) -> dict: """Run task inside a Docker container. @@ -785,7 +842,9 @@ async def _run_in_docker( # noqa: C901 ) await _upload_task(runtime, task, task_file_path) - await _setup_container_env(runtime, timeout_ms) + await _setup_container_env( + runtime, timeout_ms, enable_skill=enable_skill, enable_subagent=enable_subagent + ) container_id = await _get_container_id(runtime) result = None @@ -793,7 +852,10 @@ async def _run_in_docker( # noqa: C901 try: # Prefer foreground interactive when container_id is available and stdin is a TTY. if interactive and container_id and _stdin_is_tty(): - result = await _run_interactive_in_container(container_id, task_id, task, project_path, model, timeout_ms) + result = await _run_interactive_in_container( + container_id, task_id, task, project_path, model, timeout_ms, + enable_skill=enable_skill, enable_subagent=enable_subagent, + ) else: if interactive and not _stdin_is_tty(): print( @@ -884,6 +946,8 @@ def run_eval( use_gpu: bool = False, task_file_path: str | None = None, interactive: bool = False, + enable_skill: bool = False, + enable_subagent: bool = False, ) -> dict: """Run task in the given environment (local host or Docker). 
@@ -902,6 +966,8 @@ def run_eval( timeout_ms, skip_prereq_check=skip_prereq_check, interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, ) ) @@ -939,5 +1005,7 @@ def run_eval( timeout_ms, task_file_path=task_file_path, interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, ) ) diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/runner.py b/benchmarks/arteval_bench/src/agents/ae_agent/runner.py index 972573c2..4393e5e3 100644 --- a/benchmarks/arteval_bench/src/agents/ae_agent/runner.py +++ b/benchmarks/arteval_bench/src/agents/ae_agent/runner.py @@ -107,7 +107,13 @@ def _parse_retry_after(exc: BaseException) -> int | None: return int(m.group(1)) if m else None -# Shared prompt fragments (avoids duplication and keeps host/docker logic aligned). +def _parse_env_bool(env_var: str, default: bool = False) -> bool: + """Parse env var as bool. '1', 'true', 'yes' -> True.""" + v = os.environ.get(env_var, '').strip().lower() + return v in ('1', 'true', 'yes') if v else default + + +# Shared prompt fragments _PROMPT_TIMEOUT_HOST = ( 'TIMEOUT CONFIGURATION (CRITICAL):\n' '- Long-running commands (builds, tests, Kind cluster creation) are expected\n' @@ -202,6 +208,8 @@ async def run_agent( # noqa: C901 artifact_path: str | None = None, timeout_ms: int | None = None, interactive: bool = False, + enable_skill: bool = False, + enable_subagent: bool = False, ) -> dict: """Run the agent using Claude SDK. Single implementation for all modes. @@ -213,6 +221,8 @@ async def run_agent( # noqa: C901 artifact_path: Artifact directory path (for prompt and initial message). timeout_ms: Bash timeout in ms. interactive: If True, enter interactive multi-turn loop after initial task. + enable_skill: If True, enable Claude Agent SDK Skill (load from ~/.claude/skills/). + enable_subagent: If True, enable Claude Agent SDK Sub-agent (Task tool). 
Returns: dict with keys: exit_code (int), output (str), message_count (int) @@ -224,11 +234,18 @@ async def run_agent( # noqa: C901 if system_prompt is None: system_prompt = build_system_prompt(task, env=env, artifact_path=artifact_path, timeout_ms=timeout_ms) + allowed_tools = ['Read', 'Write', 'Bash'] + if enable_skill: + allowed_tools.append('Skill') + if enable_subagent: + allowed_tools.append('Task') + setting_sources = ['user', 'project'] if enable_skill else ['user'] + options = ClaudeAgentOptions( model=model_name, system_prompt=system_prompt, - allowed_tools=['Read', 'Write', 'Bash'], - setting_sources=['user'], + allowed_tools=allowed_tools, + setting_sources=setting_sources, ) initial_prompt = ( @@ -379,6 +396,9 @@ def docker_main(): interactive = '--interactive' in raw_args args = [a for a in raw_args if a != '--interactive'] + enable_skill = _parse_env_bool('AE_ENABLE_SKILL', False) + enable_subagent = _parse_env_bool('AE_ENABLE_SUBAGENT', False) + # Mode 1 — interactive-only (no task): runner.py --interactive [model] if interactive and len(args) <= 1: model = args[0] if args else os.environ.get('AE_AGENT_MODEL', DEFAULT_MODEL) @@ -389,6 +409,8 @@ def docker_main(): 'Please confirm you are in /repo and ready for follow-up instructions. 
Reply briefly.', system_prompt=_INTERACTIVE_SYSTEM_PROMPT, interactive=True, + enable_skill=enable_skill, + enable_subagent=enable_subagent, ) ) sys.exit(result['exit_code']) @@ -432,6 +454,8 @@ def docker_main(): artifact_path=artifact_path, timeout_ms=timeout_ms, interactive=True, + enable_skill=enable_skill, + enable_subagent=enable_subagent, ) ) else: @@ -443,6 +467,8 @@ def docker_main(): env='docker', artifact_path=artifact_path, timeout_ms=timeout_ms, + enable_skill=enable_skill, + enable_subagent=enable_subagent, ), timeout=timeout_ms / 1000.0, ) diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/utils.py b/benchmarks/arteval_bench/src/agents/ae_agent/utils.py index 6f5955ad..89497b2f 100644 --- a/benchmarks/arteval_bench/src/agents/ae_agent/utils.py +++ b/benchmarks/arteval_bench/src/agents/ae_agent/utils.py @@ -19,12 +19,15 @@ 'apply_timeout_env', 'clone_artifact_repo', 'compute_and_write_summary', + 'parse_artifact_url', 'docker_image_from_item', 'env_from_item', 'get_task', 'gpu_from_item', 'has_api_key', 'interactive_from_item', + 'enable_skill_from_item', + 'enable_subagent_from_item', 'is_local_env', 'parse_eval_score', 'read_task_from_file', @@ -114,6 +117,16 @@ def interactive_from_item(item: dict) -> bool: return _parse_bool_value(item.get('interactive', False)) +def enable_skill_from_item(item: dict, default: bool = False) -> bool: + """Whether to enable Claude Agent SDK Skill (load from ~/.claude/skills/ and .claude/skills/).""" + return _parse_bool_value(item.get('enable_skill', default)) + + +def enable_subagent_from_item(item: dict, default: bool = False) -> bool: + """Whether to enable Claude Agent SDK Sub-agent (Task tool).""" + return _parse_bool_value(item.get('enable_subagent', default)) + + def safe_task_id(task_id: str | None, fallback: str = 'unknown') -> str: """Normalize task_id for use in filenames (no spaces, lowercase).""" return (task_id or fallback).replace(' ', '_').lower() @@ -217,12 +230,35 @@ def 
read_task_from_file(artifact_path: str, task_file: str) -> str: return get_task(task_file) -def clone_artifact_repo(artifact_url: str, target_dir: str) -> str: +def parse_artifact_url(artifact_url: str) -> tuple[str, str | None]: + """Parse artifact URL into (clone_url, branch) for git clone. + + Supports GitHub-style URLs: + - https://github.com/org/repo -> (https://github.com/org/repo.git, None) + - https://github.com/org/repo/tree/branch -> (https://github.com/org/repo.git, branch) + """ + url = (artifact_url or '').strip() + if not url: + return url, None + # .../tree/ or .../tree// + tree_match = re.search(r'^(.*?)/tree/([^/#]+?)/?$', url) + if tree_match: + base, branch = tree_match.group(1), tree_match.group(2).strip() + if not base.endswith('.git'): + base = base.rstrip('/') + '.git' + return base, branch if branch else None + if not url.endswith('.git'): + url = url.rstrip('/') + '.git' + return url, None + + +def clone_artifact_repo(artifact_url: str, target_dir: str, branch: str | None = None) -> str: """Clone artifact repository from URL into target_dir. Args: - artifact_url: Git clone URL (e.g. https://github.com/org/repo.git) + artifact_url: Git clone URL (e.g. https://github.com/org/repo or .../repo/tree/branch). target_dir: Absolute path to the directory to clone into (must not exist or be empty). + branch: Optional branch to clone. If None, parse_artifact_url(artifact_url) is used. Returns: target_dir (artifact root path after clone). 
@@ -234,8 +270,14 @@ def clone_artifact_repo(artifact_url: str, target_dir: str) -> str: return target_dir if os.path.exists(target_dir): os.rmdir(target_dir) + clone_url, parsed_branch = parse_artifact_url(artifact_url) + use_branch = branch if branch is not None else parsed_branch + cmd = ['git', 'clone', '--depth', '1'] + if use_branch: + cmd.extend(['-b', use_branch]) + cmd.extend([clone_url, target_dir]) r = subprocess.run( - ['git', 'clone', '--depth', '1', artifact_url, target_dir], + cmd, capture_output=True, text=True, timeout=600, diff --git a/benchmarks/arteval_bench/src/main.py b/benchmarks/arteval_bench/src/main.py index d3702027..75222211 100644 --- a/benchmarks/arteval_bench/src/main.py +++ b/benchmarks/arteval_bench/src/main.py @@ -19,7 +19,47 @@ from run_eval_in_env import run_eval from utils import get_task -def main(file_path, model, agent, save_path): +from agents.ae_agent.utils import ( + enable_skill_from_item, + enable_subagent_from_item, + gpu_from_item, + interactive_from_item, + resolve_project_path, + safe_task_id, + timeout_ms_from_item, + write_task_report, + compute_and_write_summary, +) + + +def _persist_skipped(save_path: str, task_id: str, message: str, expected_score: int = -1) -> None: + """Append one result line for a skipped task so summary total is accurate (same as ae-agent).""" + result = { + 'task_id': task_id, + 'status': 'skipped', + 'message': message, + 'expected_score': expected_score, + } + with open(os.path.join(save_path, 'result.jsonl'), 'a+', encoding='utf-8') as fw: + fw.write(json.dumps(result, ensure_ascii=False) + '\n') + + +def _parse_bool(v, default=False): + if isinstance(v, bool): + return v + if isinstance(v, str): + return v.strip().lower() in ('true', '1', 'yes') + return bool(v) if v is not None else default + + +def _is_ae_agent(agent): + """True if agent path points to the ae_agent (for report/summary writing).""" + if not agent: + return False + return 'ae_agent' in agent or os.path.basename(agent) 
== 'ae_agent' + + +def main(file_path, model, agent, save_path, interactive_default=False, enable_skill_default=False, enable_subagent_default=False): """Main function for running the benchmark.""" logger.info(f'Using model: {model}, agent: {agent}') with open(file_path) as f: @@ -45,14 +85,34 @@ def main(file_path, model, agent, save_path): else: deployment = item.get('docker_env', None) or item.get('docer_env', None) run_on_host = item.get('run_on_host', False) - project_path = f"./data/benchmark/{item.get('artifact_dir', None)}" - task_file = item.get('artifact_readme', None) task_id = item.get('artifact_id', None) + project_path, path_error = resolve_project_path(item, file_path, save_path) + if path_error: + logger.info(f"Task {task_id}: {path_error}") + _persist_skipped( + save_path, + task_id or safe_task_id(task_id), + path_error, + item.get('expected_score', -1), + ) + continue + task_file = item.get('artifact_readme', None) test_method = item.get('evaluator', None) + timeout_ms = timeout_ms_from_item(item) + gpu = gpu_from_item(item) + interactive = interactive_from_item(item) or interactive_default + enable_skill = enable_skill_from_item(item, enable_skill_default) + enable_subagent = enable_subagent_from_item(item, enable_subagent_default) + keep_container = _parse_bool(item.get('keep_container'), False) + task = get_task(task_file) - logger.info(f"Task {task_id}: run_on_host={run_on_host}") + logger.info( + f"Task {task_id}: project_path={project_path}, run_on_host={run_on_host}, " + f"timeout_ms={timeout_ms}, gpu={gpu}, interactive={interactive}, " + f"enable_skill={enable_skill}, enable_subagent={enable_subagent}, keep_container={keep_container}" + ) result = run_eval( deployment=deployment, @@ -63,12 +123,36 @@ def main(file_path, model, agent, save_path): agent_path=agent, test_method=test_method, save_path=save_path, - run_on_host=run_on_host, # Pass the flag + run_on_host=run_on_host, + timeout_ms=timeout_ms, + gpu=gpu, + interactive=interactive, 
+ enable_skill=enable_skill, + enable_subagent=enable_subagent, + keep_container=keep_container, ) result['expected_score'] = item.get('expected_score', -1) + result['timestamp'] = result.get('timestamp') or datetime.now().isoformat() with open(f'{save_path}/result.jsonl', 'a+', encoding='utf-8') as fw: - fw.write(json.dumps(result) + '\n') + fw.write(json.dumps(result, ensure_ascii=False) + '\n') + + # When using ae_agent, also write per-task AE report (same as standalone ae-agent). + if _is_ae_agent(agent): + safe_id = safe_task_id(task_id) + log_path = result.get('log_file') or '(log not captured when run via benchmark)' + agent_summary = (result.get('agent_run_results') or '')[:8000] or '(No summary captured)' + try: + write_task_report(save_path, safe_id, task_id, result, log_path, agent_summary) + except Exception as e: + logger.warning('write_task_report failed: %s', e) + + # Write summary.json (total/success counts) when ae_agent was used. + if _is_ae_agent(agent): + try: + compute_and_write_summary(save_path) + except Exception as e: + logger.warning('compute_and_write_summary failed: %s', e) success_count = 0 total_count = 0 @@ -107,6 +191,21 @@ def main(file_path, model, agent, save_path): help='Model Name', default='claude-sonnet-4-5-20250929', ) + parser.add_argument( + '--interactive', + action='store_true', + help='Enable interactive mode (continue giving agent instructions after task completes)', + ) + parser.add_argument( + '--enable-skill', + action='store_true', + help='Enable Claude Agent SDK Skill (load from ~/.claude/skills/)', + ) + parser.add_argument( + '--enable-subagent', + action='store_true', + help='Enable Claude Agent SDK Sub-agent (Task tool)', + ) # Note that if your benchmark has multiple tasks, you need to add --task # in your code to enable task selection. 
parser.add_argument('-t', '--task', help='specify task in scenarios', default=None) @@ -136,4 +235,12 @@ def main(file_path, model, agent, save_path): save_path = os.path.abspath(os.path.expanduser(save_path)) os.makedirs(save_path, exist_ok=True) - main(input_file, model_name, agent, save_path) + main( + input_file, + model_name, + agent, + save_path, + interactive_default=getattr(args, 'interactive', False), + enable_skill_default=getattr(args, 'enable_skill', False), + enable_subagent_default=getattr(args, 'enable_subagent', False), + ) diff --git a/benchmarks/arteval_bench/src/run_eval_in_env.py b/benchmarks/arteval_bench/src/run_eval_in_env.py index 748addfc..d62356d4 100644 --- a/benchmarks/arteval_bench/src/run_eval_in_env.py +++ b/benchmarks/arteval_bench/src/run_eval_in_env.py @@ -54,15 +54,15 @@ def setup_claude_settings_on_host(): settings = { "env": { - "BASH_MAX_TIMEOUT_MS": "172800000", # 48 hours - "BASH_DEFAULT_TIMEOUT_MS": "172800000" + "BASH_MAX_TIMEOUT_MS": "345600000", # 96 hours + "BASH_DEFAULT_TIMEOUT_MS": "345600000" } } with open(settings_file, 'w') as f: json.dump(settings, f, indent=2) - logger.info(f"Created {settings_file} with 48-hour timeout configuration.") + logger.info(f"Created {settings_file} with 96-hour timeout configuration.") def _is_ae_agent_path(agent_path) -> bool: @@ -73,7 +73,97 @@ def _is_ae_agent_path(agent_path) -> bool: return p.endswith("ae_agent") or os.path.basename(p) == "ae_agent" -async def run_eval_on_host(project_path, task_id, task, model, agent_path, test_method, save_path): +def _stdin_is_tty() -> bool: + """True if stdin is a TTY (required for docker exec -it).""" + return getattr(sys.stdin, "isatty", lambda: False)() + + +async def _get_container_id_from_runtime(runtime, deployment) -> str: + """Get Docker container ID from inside the container (hostname/cgroup) or from deployment.""" + container_id = "unknown" + try: + res = await runtime.run_in_session( + BashAction(command='cat /etc/hostname 
2>/dev/null || hostname 2>/dev/null || echo "unknown"', timeout=10.0) + ) + container_id = str(getattr(res, "output", "")).strip() + try: + cgroup_res = await runtime.run_in_session( + BashAction(command='cat /proc/self/cgroup 2>/dev/null | grep docker | head -1 | cut -d/ -f3 | cut -c1-12 || echo ""', timeout=10.0) + ) + cid = str(getattr(cgroup_res, "output", "")).strip() + if cid: + container_id = cid + except Exception: + pass + if hasattr(deployment, '_container_id') and getattr(deployment, '_container_id', None): + container_id = deployment._container_id + elif hasattr(deployment, 'container_id') and getattr(deployment, 'container_id', None): + container_id = deployment.container_id + except Exception as e: + logger.warning('Failed to get container ID: %s', e) + return container_id + + +async def _run_ae_agent_interactive_foreground( + container_id: str, + model: str, + timeout_ms: int | None, + enable_skill: bool, + enable_subagent: bool, +): + """Run ae_agent runner in foreground via docker exec -it (interactive mode). 
Returns MockResult with exit_code.""" + try: + from agents.ae_agent.utils import resolve_timeout_ms + from agents.ae_agent.run_eval import _docker_exec_env_args + except ImportError: + _src = os.path.dirname(os.path.abspath(__file__)) + if _src not in sys.path: + sys.path.insert(0, _src) + from agents.ae_agent.utils import resolve_timeout_ms + from agents.ae_agent.run_eval import _docker_exec_env_args + + timeout_resolved = resolve_timeout_ms(timeout_ms) + exec_env = _docker_exec_env_args( + timeout_resolved, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) + exec_args = ( + ['docker', 'exec', '-it'] + + exec_env + + [container_id, 'python3', '-u', '/agent/runner.py', model, '/agent/current_task.txt', '--interactive'] + ) + logger.info('Running ae_agent in interactive mode (foreground): docker exec -it %s ...', container_id[:12]) + proc = await asyncio.to_thread( + subprocess.run, + exec_args, + stdin=sys.stdin, + stdout=sys.stdout, + stderr=sys.stderr, + ) + exit_code = proc.returncode if proc else -1 + + class MockResult: + def __init__(self, code, output=''): + self.exit_code = code + self.output = output or f'exit_code={code}' + + return MockResult(exit_code, f'Interactive session (exit_code={exit_code})') + + +async def run_eval_on_host( + project_path, + task_id, + task, + model, + agent_path, + test_method, + save_path, + timeout_ms=None, + interactive=False, + enable_skill=False, + enable_subagent=False, +): """Run evaluation directly on host machine (no Docker container). 
When agent is ae_agent, delegates to ae_agent.run_agent_then_eval (agent run + evaluation script), @@ -99,8 +189,11 @@ async def run_eval_on_host(project_path, task_id, task, model, agent_path, test_ model=model, test_method=test_method, save_path=save_path, - timeout_ms=None, + timeout_ms=timeout_ms, skip_prereq_check=False, + interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, ) return result @@ -169,8 +262,8 @@ async def run_eval_on_host(project_path, task_id, task, model, agent_path, test_ setting_sources=["user"], ) - os.environ['BASH_MAX_TIMEOUT_MS'] = '172800000' - os.environ['BASH_DEFAULT_TIMEOUT_MS'] = '172800000' + os.environ['BASH_MAX_TIMEOUT_MS'] = '345600000' + os.environ['BASH_DEFAULT_TIMEOUT_MS'] = '345600000' logger.info("Starting Claude Agent SDK (Host Mode)...") @@ -234,16 +327,32 @@ async def run_eval_on_host(project_path, task_id, task, model, agent_path, test_ return result -async def run_eval_in_env(deployment, project_path, task_id, task, model, agent_path, test_method, save_path): +async def run_eval_in_env( + deployment, + project_path, + task_id, + task, + model, + agent_path, + test_method, + save_path, + timeout_ms=None, + gpu=False, + interactive=False, + enable_skill=False, + enable_subagent=False, + keep_container=True, +): """Spoiler: This function will work with any deployment.""" await deployment.start() runtime = deployment.runtime + # Default 96h when timeout_ms not provided + runner_timeout_sec = (timeout_ms / 1000.0) if timeout_ms is not None else 345600.0 if hasattr(runtime, "_config"): logger.info(f"Current RemoteRuntime timeout: {runtime._config.timeout}s") - # 48 hours = 172800s (aligned with Bash command timeout) - runtime._config.timeout = 172800.0 - logger.info(f"Overriding RemoteRuntime timeout to {runtime._config.timeout}s (48 hours)") + runtime._config.timeout = runner_timeout_sec + logger.info(f"Overriding RemoteRuntime timeout to {runtime._config.timeout}s") # Issue a few one-off 
commands, similar to `subprocess.run()` logger.info(await runtime.execute(Command(command=['echo', 'Hello, world!']))) @@ -321,6 +430,10 @@ async def run_eval_in_env(deployment, project_path, task_id, task, model, agent_ parts.append(f"export ANTHROPIC_FOUNDRY_BASE_URL='{escaped_url}'") if os.environ.get('CLAUDE_CODE_USE_FOUNDRY') == '1': parts.append("export CLAUDE_CODE_USE_FOUNDRY=1") + if enable_skill: + parts.append("export AE_ENABLE_SKILL=1") + if enable_subagent: + parts.append("export AE_ENABLE_SUBAGENT=1") if parts: set_env_cmd = " && ".join(parts) logger.info('Setting Anthropic/Foundry API key and env in container...') @@ -342,127 +455,146 @@ async def run_eval_in_env(deployment, project_path, task_id, task, model, agent_ logger.info('Task file uploaded to /agent/current_task.txt for ae_agent.') logger.info('Running runner script...') - runner_timeout = 172800.0 if is_long_running_agent else 1200.0 # 48h for claude_sdk/ae_agent - - if is_long_running_agent: - # Live log monitoring: run runner in background, poll log file periodically - await runtime.run_in_session(BashAction(command='rm -f /agent/runner.live.log && touch /agent/runner.live.log', timeout=10.0)) - - # ae_agent: use task file to avoid shell quoting; others pass task string - if is_ae_agent: - start_cmd = ( - 'stdbuf -oL -eL /agent/runner.sh "' + model + '" /agent/current_task.txt > /agent/runner.live.log 2>&1 & ' - 'RUNNER_PID=$!; ' - 'sleep 1; ' - 'echo RUNNER_PID=$RUNNER_PID' - ) - else: - start_cmd = ( - f'bash -c "stdbuf -oL -eL /agent/runner.sh \\"{model}\\" \\"{task}\\" > /agent/runner.live.log 2>&1 & ' - 'RUNNER_PID=$!; ' - 'sleep 1; ' - 'echo RUNNER_PID=$RUNNER_PID"' - ) - start_res = await runtime.run_in_session(BashAction(command=start_cmd, timeout=30.0)) - start_output = str(getattr(start_res, "output", "")).strip() - - pid = None - for line in start_output.split('\n'): - if 'RUNNER_PID=' in line: - pid = line.split('RUNNER_PID=', 1)[1].strip() - break - - if not pid or not 
pid.isdigit(): - # Fallback: find PID by process name after short delay - await asyncio.sleep(2) - ps_res = await runtime.run_in_session( - BashAction(command="ps aux | grep '[r]unner.py' | awk '{print $2}' | head -1", timeout=10.0) - ) - pid = str(getattr(ps_res, "output", "")).strip() - - logger.info(f'{agent_label} runner started with pid: {pid}') - - await asyncio.sleep(2) # Allow log file to have content - - elapsed = 0.0 - poll_interval = 10.0 # Poll every 10s for live log - run_results = None - last_log_content = "" # Track last read content to avoid duplicate output + if timeout_ms is not None: + runner_timeout = timeout_ms / 1000.0 + else: + runner_timeout = 345600.0 if is_long_running_agent else 1200.0 # 96h for long-running agents - while elapsed < runner_timeout: + run_results = None + # Docker + interactive: run ae_agent in foreground via docker exec -it (same as standalone ae-agent). + if is_ae_agent and interactive and _stdin_is_tty(): + container_id_early = await _get_container_id_from_runtime(runtime, deployment) + if container_id_early and container_id_early != "unknown": try: - log_res = await runtime.run_in_session( - BashAction(command='cat /agent/runner.live.log 2>/dev/null || echo ""', timeout=30.0) + run_results = await _run_ae_agent_interactive_foreground( + container_id_early, model, timeout_ms, enable_skill, enable_subagent ) - current_log_content = str(getattr(log_res, "output", "")).strip() - - if current_log_content and current_log_content != last_log_content: - if last_log_content and current_log_content.startswith(last_log_content): - new_content = current_log_content[len(last_log_content):].strip() - if new_content: - logger.info(f'[{agent_label} live log @ {elapsed:.0f}s ({elapsed/60:.1f} min)]\n{new_content}') - else: - logger.info(f'[{agent_label} live log @ {elapsed:.0f}s ({elapsed/60:.1f} min)]\n{current_log_content}') - last_log_content = current_log_content - elif elapsed % 300 == 0 and elapsed > 0: - 
logger.info(f'[{agent_label} still running @ {elapsed:.0f}s ({elapsed/60:.1f} min), no new output]') + logger.info('ae_agent interactive session finished with exit_code=%s', run_results.exit_code) except Exception as e: - logger.info(f'Failed to read {agent_label} live log: {e}') - - if pid and pid.isdigit(): - ps_res = await runtime.run_in_session( - BashAction(command=f'ps -p {pid} >/dev/null 2>&1; echo $?', timeout=10.0) + logger.warning('ae_agent interactive foreground failed: %s', e) + else: + logger.warning('Cannot get container ID for interactive mode; falling back to non-interactive.') + + if run_results is None: + if is_long_running_agent: + # Live log monitoring: run runner in background, poll log file periodically + await runtime.run_in_session(BashAction(command='rm -f /agent/runner.live.log && touch /agent/runner.live.log', timeout=10.0)) + + # ae_agent: use task file to avoid shell quoting; others pass task string + if is_ae_agent: + start_cmd = ( + 'stdbuf -oL -eL /agent/runner.sh "' + model + '" /agent/current_task.txt > /agent/runner.live.log 2>&1 & ' + 'RUNNER_PID=$!; ' + 'sleep 1; ' + 'echo RUNNER_PID=$RUNNER_PID' ) - ps_code = str(getattr(ps_res, "output", "")).strip() - if ps_code != "0": - wait_res = await runtime.run_in_session( - BashAction(command=f'wait {pid} 2>/dev/null; echo $?', timeout=30.0) - ) - exit_code_str = str(getattr(wait_res, "output", "")).strip() - - class MockResult: - def __init__(self, code): - self.exit_code = int(code) if code.isdigit() else 0 - self.output = f'exit_code={self.exit_code}' - run_results = MockResult(exit_code_str) - logger.info(f'{agent_label} runner finished with exit code: {run_results.exit_code}') - break else: - ps_res = await runtime.run_in_session( - BashAction(command="ps aux | grep '[r]unner.py' | wc -l", timeout=10.0) + start_cmd = ( + f'bash -c "stdbuf -oL -eL /agent/runner.sh \\"{model}\\" \\"{task}\\" > /agent/runner.live.log 2>&1 & ' + 'RUNNER_PID=$!; ' + 'sleep 1; ' + 'echo 
RUNNER_PID=$RUNNER_PID"' ) - proc_count = str(getattr(ps_res, "output", "")).strip() - if proc_count == "0" or not proc_count.isdigit() or int(proc_count) == 0: - logger.info(f'{agent_label} runner process not found, assuming finished') - class MockResult: - def __init__(self): - self.exit_code = 0 - self.output = 'exit_code=0' - run_results = MockResult() + start_res = await runtime.run_in_session(BashAction(command=start_cmd, timeout=30.0)) + start_output = str(getattr(start_res, "output", "")).strip() + + pid = None + for line in start_output.split('\n'): + if 'RUNNER_PID=' in line: + pid = line.split('RUNNER_PID=', 1)[1].strip() break - await asyncio.sleep(poll_interval) - elapsed += poll_interval + if not pid or not pid.isdigit(): + # Fallback: find PID by process name after short delay + await asyncio.sleep(2) + ps_res = await runtime.run_in_session( + BashAction(command="ps aux | grep '[r]unner.py' | awk '{print $2}' | head -1", timeout=10.0) + ) + pid = str(getattr(ps_res, "output", "")).strip() + + logger.info(f'{agent_label} runner started with pid: {pid}') + + await asyncio.sleep(2) # Allow log file to have content + + elapsed = 0.0 + poll_interval = 10.0 # Poll every 10s for live log + run_results = None + last_log_content = "" # Track last read content to avoid duplicate output - if run_results is None: - # Timeout: try to kill process and capture final log - if pid and pid.isdigit(): + while elapsed < runner_timeout: try: - await runtime.run_in_session(BashAction(command=f'kill -TERM {pid} 2>/dev/null || kill -9 {pid} 2>/dev/null || true', timeout=10.0)) - except Exception: - pass - try: - tail_log = await runtime.run_in_session( - BashAction(command='tail -n 200 /agent/runner.live.log', timeout=30.0) - ) - logger.info(f'{agent_label} live log tail (on timeout):\n{tail_log}') - except Exception as e: - logger.info(f'Failed to read {agent_label} live log after timeout: {e}') - raise TimeoutError(f'{agent_label} runner exceeded timeout 
{runner_timeout}s') + log_res = await runtime.run_in_session( + BashAction(command='cat /agent/runner.live.log 2>/dev/null || echo ""', timeout=30.0) + ) + current_log_content = str(getattr(log_res, "output", "")).strip() + + if current_log_content and current_log_content != last_log_content: + if last_log_content and current_log_content.startswith(last_log_content): + new_content = current_log_content[len(last_log_content):].strip() + if new_content: + logger.info(f'[{agent_label} live log @ {elapsed:.0f}s ({elapsed/60:.1f} min)]\n{new_content}') + else: + logger.info(f'[{agent_label} live log @ {elapsed:.0f}s ({elapsed/60:.1f} min)]\n{current_log_content}') + last_log_content = current_log_content + elif elapsed % 300 == 0 and elapsed > 0: + logger.info(f'[{agent_label} still running @ {elapsed:.0f}s ({elapsed/60:.1f} min), no new output]') + except Exception as e: + logger.info(f'Failed to read {agent_label} live log: {e}') - else: - runner_cmd = f'/agent/runner.sh "{model}" "{task}"' - run_results = await runtime.run_in_session(BashAction(command=runner_cmd, timeout=runner_timeout)) + if pid and pid.isdigit(): + ps_res = await runtime.run_in_session( + BashAction(command=f'ps -p {pid} >/dev/null 2>&1; echo $?', timeout=10.0) + ) + ps_code = str(getattr(ps_res, "output", "")).strip() + if ps_code != "0": + wait_res = await runtime.run_in_session( + BashAction(command=f'wait {pid} 2>/dev/null; echo $?', timeout=30.0) + ) + exit_code_str = str(getattr(wait_res, "output", "")).strip() + + class MockResult: + def __init__(self, code): + self.exit_code = int(code) if code.isdigit() else 0 + self.output = f'exit_code={self.exit_code}' + run_results = MockResult(exit_code_str) + logger.info(f'{agent_label} runner finished with exit code: {run_results.exit_code}') + break + else: + ps_res = await runtime.run_in_session( + BashAction(command="ps aux | grep '[r]unner.py' | wc -l", timeout=10.0) + ) + proc_count = str(getattr(ps_res, "output", "")).strip() + if proc_count 
== "0" or not proc_count.isdigit() or int(proc_count) == 0: + logger.info(f'{agent_label} runner process not found, assuming finished') + class MockResult: + def __init__(self): + self.exit_code = 0 + self.output = 'exit_code=0' + run_results = MockResult() + break + + await asyncio.sleep(poll_interval) + elapsed += poll_interval + + if run_results is None: + # Timeout: try to kill process and capture final log + if pid and pid.isdigit(): + try: + await runtime.run_in_session(BashAction(command=f'kill -TERM {pid} 2>/dev/null || kill -9 {pid} 2>/dev/null || true', timeout=10.0)) + except Exception: + pass + try: + tail_log = await runtime.run_in_session( + BashAction(command='tail -n 200 /agent/runner.live.log', timeout=30.0) + ) + logger.info(f'{agent_label} live log tail (on timeout):\n{tail_log}') + except Exception as e: + logger.info(f'Failed to read {agent_label} live log after timeout: {e}') + raise TimeoutError(f'{agent_label} runner exceeded timeout {runner_timeout}s') + + else: + runner_cmd = f'/agent/runner.sh "{model}" "{task}"' + run_results = await runtime.run_in_session(BashAction(command=runner_cmd, timeout=runner_timeout)) logger.info(f"agent's run results: {run_results}") logger.info('Runner script finished.') @@ -516,54 +648,49 @@ def __init__(self): 'status': f'error: {str(e)}', } - # For long-running agents: keep container running for inspection + # For long-running agents: sync+stop (when keep_container=False) or keep container for inspection if is_long_running_agent: - logger.info('=' * 80) - logger.info(f'Keeping Docker container running for {agent_label} (for debugging purposes).') - - container_id = "unknown" - container_name = "unknown" - try: - container_id_res = await runtime.run_in_session( - BashAction(command='cat /etc/hostname 2>/dev/null || hostname 2>/dev/null || echo "unknown"', timeout=10.0) - ) - container_id = str(getattr(container_id_res, "output", "")).strip() + container_id = await _get_container_id_from_runtime(runtime, 
deployment) + container_name = ( + getattr(deployment, '_container_name', None) + or getattr(deployment, 'container_name', None) + or 'unknown' + ) + if is_ae_agent and not keep_container and container_id and container_id != "unknown": + # Original artifact-agent behavior: sync workspace, commit image, stop container try: - docker_info_res = await runtime.run_in_session( - BashAction(command='cat /proc/self/cgroup 2>/dev/null | grep docker | head -1 | cut -d/ -f3 | cut -c1-12 || echo ""', timeout=10.0) - ) - docker_container_id = str(getattr(docker_info_res, "output", "")).strip() - if docker_container_id: - container_id = docker_container_id - except Exception: - pass - - if hasattr(deployment, '_container_id'): - container_id = deployment._container_id - elif hasattr(deployment, 'container_id'): - container_id = deployment.container_id - if hasattr(deployment, '_container_name'): - container_name = deployment._container_name - elif hasattr(deployment, 'container_name'): - container_name = deployment.container_name - except Exception as e: - logger.warning(f'Failed to get container information: {e}') - - logger.info(f'Container Information:') - logger.info(f' Container ID: {container_id}') - logger.info(f' Container Name: {container_name}') - logger.info(f' Task ID: {task_id}') - logger.info(f' Project Path: {project_path}') - logger.info(f' To inspect the container, use: docker exec -it {container_id} /bin/bash') - logger.info(f' Or find container by name/image and inspect manually') - logger.info(f' NOTE: Container will remain running. 
To stop it manually, use: docker stop {container_id}') - logger.info(f' WARNING: Remember to clean up containers to save storage space!') - logger.info('=' * 80) - - result['container_id'] = container_id - result['container_name'] = container_name - result['container_kept'] = True + from agents.ae_agent.run_eval import save_container_after_run + saved_image, container_stopped = save_container_after_run(container_id, project_path, task_id) + result['saved_image'] = saved_image + result['container_stopped'] = container_stopped + result['container_id'] = container_id + result['container_kept'] = False + logger.info(f'ae_agent: synced workspace, saved image={saved_image}, stopped={container_stopped}') + except Exception as e: + logger.warning(f'save_container_after_run failed: {e}') + result['container_id'] = container_id + result['container_kept'] = True + try: + await deployment.stop() + except Exception as e: + logger.warning(f'deployment.stop() failed: {e}') + elif keep_container: + logger.info('=' * 80) + logger.info(f'Keeping Docker container running for {agent_label} (for debugging purposes).') + logger.info(f'Container ID: {container_id}') + logger.info(f'Task ID: {task_id}') + logger.info(f'Project Path: {project_path}') + logger.info(f' To inspect: docker exec -it {container_id} /bin/bash') + logger.info(f' To stop: docker stop {container_id}') + logger.info('=' * 80) + result['container_id'] = container_id + result['container_name'] = container_name + result['container_kept'] = True + else: + await deployment.stop() + result['container_id'] = container_id + result['container_kept'] = False else: await deployment.stop() result['container_kept'] = False @@ -572,9 +699,25 @@ def __init__(self): return result -def run_eval(deployment, project_path, task_id, task, model, agent_path, test_method, save_path, run_on_host=False): +def run_eval( + deployment, + project_path, + task_id, + task, + model, + agent_path, + test_method, + save_path, + run_on_host=False, + 
timeout_ms=None, + gpu=False, + interactive=False, + enable_skill=False, + enable_subagent=False, + keep_container=True, +): """Run evaluation either on host or in Docker container. - + Args: deployment: Docker image to use (ignored if run_on_host=True) project_path: Path to the artifact project @@ -585,36 +728,67 @@ def run_eval(deployment, project_path, task_id, task, model, agent_path, test_me test_method: Evaluation command save_path: Path to save results run_on_host: If True, run directly on host machine instead of Docker + timeout_ms: Per-task timeout in milliseconds (None = default 96h for long-running agents) + gpu: If True, pass --gpus all to Docker (Docker mode only) + interactive: If True, enable interactive mode after task (ae_agent only) + enable_skill: If True, enable Claude Agent SDK Skill (ae_agent only) + enable_subagent: If True, enable Claude Agent SDK Sub-agent (ae_agent only) + keep_container: If False and ae_agent, sync workspace + commit image + stop container after run """ - + if run_on_host: - # Run directly on host machine (no Docker container) logger.info(f"Task {task_id} configured to run on HOST machine (run_on_host=True)") return asyncio.run( - run_eval_on_host(project_path, task_id, task, model, agent_path, test_method, save_path) + run_eval_on_host( + project_path, + task_id, + task, + model, + agent_path, + test_method, + save_path, + timeout_ms=timeout_ms, + interactive=interactive, + enable_skill=enable_skill, + enable_subagent=enable_subagent, + ) ) - - # Run in Docker container (original behavior) + + # Run in Docker container image = deployment or 'bastoica/ae-agent-ubuntu24.04:latest' - # Enable privileged mode for Docker-in-Docker scenarios (e.g., Kind clusters) - # This is required for Kubernetes-based artifact evaluations like Acto - # Additional args for cgroups v2 compatibility: - # - --cgroupns=host: Share cgroup namespace with host (required for Kind in cgroups v2) - # - Environment variables for Kind cgroups v2 
compatibility
+    docker_args = [
+        '--privileged',
+        '--cgroupns=host',
+        '-e', 'KIND_EXPERIMENTAL_CONTAINERD_SNAPSHOTTER=native',
+    ]
+    if gpu:
+        docker_args.extend(['--gpus', 'all'])
+
     config = DockerDeploymentConfig(
         image=image,
         startup_timeout=1200.0,
-        docker_args=[
-            '--privileged',  # Required for Kind cluster creation
-            '--cgroupns=host',  # Required for Kind nodes to start with cgroups v2
-            '-e', 'KIND_EXPERIMENTAL_CONTAINERD_SNAPSHOTTER=native',  # Better cgroups v2 support
-        ],
+        docker_args=docker_args,
     )
     deployment_obj = config.get_deployment()
 
     return asyncio.run(
-        run_eval_in_env(deployment_obj, project_path, task_id, task, model, agent_path, test_method, save_path)
+        run_eval_in_env(
+            deployment_obj,
+            project_path,
+            task_id,
+            task,
+            model,
+            agent_path,
+            test_method,
+            save_path,
+            timeout_ms=timeout_ms,
+            gpu=gpu,
+            interactive=interactive,
+            enable_skill=enable_skill,
+            enable_subagent=enable_subagent,
+            keep_container=keep_container,
+        )
     )

From d9af6f102622aa6ec810ff66ada33e4cb6ef4473 Mon Sep 17 00:00:00 2001
From: couen <2631403308@qq.com>
Date: Sun, 1 Mar 2026 14:20:39 +0000
Subject: [PATCH 7/7] Fix evaluator invocation: run Python scripts with python
 from repo root

ArtEval evaluator field in JSONL points to _agent_eval/main.py. The code
was running it as a bare shell command, causing "No such file or
directory" (127). Now run with "cd /repo && python <script>" in Docker
and "cd <project_path> && python <script>" on host when the evaluator
path ends with .py.
- run_eval_in_env.py: build eval_cmd with "cd /repo && python <script>" for .py paths
- ae_agent/run_eval.py: same for run_agent_then_eval (host path)
---
 benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py | 6 +++++-
 benchmarks/arteval_bench/src/run_eval_in_env.py          | 8 +++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py b/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py
index aaba73cc..75be027c 100644
--- a/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py
+++ b/benchmarks/arteval_bench/src/agents/ae_agent/run_eval.py
@@ -270,7 +270,11 @@ async def _run_agent_then_eval_async(
     # 2. Run evaluation script if provided
     if test_method and test_method.strip():
         try:
-            eval_cmd = f'cd {project_path} && {test_method}'
+            # Evaluator from JSONL is a path to main.py; run with python from project root.
+            if test_method.strip().endswith('.py'):
+                eval_cmd = f'cd {project_path} && python {test_method.strip()}'
+            else:
+                eval_cmd = f'cd {project_path} && {test_method}'
             eval_result = subprocess.run(
                 eval_cmd,
                 shell=True,
diff --git a/benchmarks/arteval_bench/src/run_eval_in_env.py b/benchmarks/arteval_bench/src/run_eval_in_env.py
index d62356d4..afc5a22b 100644
--- a/benchmarks/arteval_bench/src/run_eval_in_env.py
+++ b/benchmarks/arteval_bench/src/run_eval_in_env.py
@@ -626,8 +626,14 @@ def __init__(self):
         else:
             logger.warning(f'No _agent_eval directories found in {project_path}')
 
+        # Run evaluator: JSONL evaluator is a path to main.py (e.g. sosp23_acto/_agent_eval/main.py);
+        # must run from /repo with `python <script>` so the script is executed correctly.
+ if test_method.strip().endswith('.py'): + eval_cmd = f"cd /repo && python {test_method.strip()}" + else: + eval_cmd = f"cd /repo && {test_method}" try: - test_output = await runtime.run_in_session(BashAction(command=test_method)) + test_output = await runtime.run_in_session(BashAction(command=eval_cmd)) logger.info(test_output) result = { 'task': task,