Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,6 @@
logs/
.claude/scheduled_tasks.lock
.loopal/worktrees/
runs/
**/__pycache__/
benchmarks/terminal_bench/bin/
1 change: 1 addition & 0 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ rust_binary(
"//crates/loopal-protocol",
"//crates/loopal-tool-api",
"//crates/loopal-config",
"//crates/loopal-telemetry",
"//crates/loopal-runtime",
"//crates/loopal-storage",
"//crates/loopal-session",
Expand Down
7 changes: 6 additions & 1 deletion MODULE.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,13 @@ crate.spec(package = "indexmap", version = "2")
crate.spec(package = "uuid", version = "1", features = ["v4", "serde"])
crate.spec(package = "chrono", version = "0.4", features = ["serde"])
crate.spec(package = "tracing", version = "0.1")
crate.spec(package = "tracing-subscriber", version = "0.3", features = ["env-filter"])
crate.spec(package = "tracing-subscriber", version = "0.3", features = ["env-filter", "registry"])
crate.spec(package = "tracing-appender", version = "0.2")
crate.spec(package = "opentelemetry", version = "0.28", features = ["trace", "metrics", "logs"])
crate.spec(package = "opentelemetry_sdk", version = "0.28", features = ["trace", "metrics", "logs", "rt-tokio"])
crate.spec(package = "opentelemetry-otlp", version = "0.28", features = ["trace", "metrics", "logs", "grpc-tonic"])
crate.spec(package = "tracing-opentelemetry", version = "0.29")
crate.spec(package = "opentelemetry-appender-tracing", version = "0.28")
crate.spec(package = "unicode-width", version = "0.2")
crate.spec(package = "textwrap", version = "0.16")
crate.spec(package = "pulldown-cmark", version = "0.12", default_features = False)
Expand Down
954 changes: 685 additions & 269 deletions MODULE.bazel.lock

Large diffs are not rendered by default.

Empty file added benchmarks/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions benchmarks/terminal_bench/Dockerfile.build
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Build image for the linux/amd64 Loopal binary, based on the public Bazel
# 8.1.0 image.
FROM --platform=linux/amd64 gcr.io/bazel-public/bazel:8.1.0

# apt-get below needs root (presumably the base image defaults to a
# non-root user -- TODO confirm).
USER root
# OpenSSL headers and pkg-config for the build; the apt lists are removed in
# the same layer to keep it small.
RUN apt-get update -qq && apt-get install -y -qq libssl-dev pkg-config && rm -rf /var/lib/apt/lists/*

# The whole build context (expected to be the repository root) is the Bazel
# workspace.
WORKDIR /workspace
COPY . /workspace/

# Optimized (-c opt) build of the top-level loopal target.
RUN bazel build //:loopal -c opt

# Copy binary to a known location
RUN cp bazel-bin/loopal /usr/local/bin/loopal && chmod +x /usr/local/bin/loopal
17 changes: 17 additions & 0 deletions benchmarks/terminal_bench/Dockerfile.build-arm64
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Build image for the linux/arm64 Loopal binary. Starts from plain Debian and
# installs the build toolchain plus Bazelisk.
FROM --platform=linux/arm64 debian:bookworm

# Toolchain and build dependencies; the apt lists are removed in the same
# layer to keep it small.
RUN apt-get update && apt-get install -y \
    curl git libssl-dev pkg-config build-essential \
    python3 python3-pip zip unzip openjdk-17-jdk-headless \
    && rm -rf /var/lib/apt/lists/*

# Install Bazelisk (auto-downloads correct Bazel version)
# -f makes curl exit non-zero on an HTTP error instead of saving the error
# page as the "bazel" executable; -sS hides the progress meter but still
# prints errors.
# NOTE(review): "latest" is unpinned, so rebuilds may pick up a different
# Bazelisk release.
RUN curl -fsSLo /usr/local/bin/bazel https://github.com/bazelbuild/bazelisk/releases/latest/download/bazelisk-linux-arm64 \
    && chmod +x /usr/local/bin/bazel

# The whole build context (expected to be the repository root) is the Bazel
# workspace.
WORKDIR /workspace
COPY . /workspace/

# Optimized (-c opt) build of the top-level loopal target.
RUN bazel build //:loopal -c opt

# Copy the binary to a known location on PATH.
RUN cp bazel-bin/loopal /usr/local/bin/loopal && chmod +x /usr/local/bin/loopal
Empty file.
118 changes: 118 additions & 0 deletions benchmarks/terminal_bench/claude_code_compare_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
"""Claude Code comparison adapter for Terminal-Bench.

Same BENCHMARK_SUFFIX as Loopal adapter, routed through the same API proxy,
so the only variable is the agent harness itself (Claude Code vs Loopal).
"""

import inspect
import os
import shlex
from pathlib import Path

from terminal_bench.agents.installed_agents.claude_code.claude_code_agent import (
ClaudeCodeAgent,
)
from terminal_bench.terminal.models import TerminalCommand

# Directory that contains the ORIGINAL ClaudeCodeAgent module.
# _get_templated_script_path uses inspect.getfile(self.__class__), which for
# the subclass defined below would resolve to THIS file. Files that ship next
# to the original agent (e.g. its setup template) are looked up here instead.
_ORIGINAL_AGENT_DIR = Path(inspect.getfile(ClaudeCodeAgent)).parent


class ClaudeCodeCompareAgent(ClaudeCodeAgent):
    """Claude Code with same BENCHMARK_SUFFIX and API proxy as Loopal.

    Subclasses the stock ``ClaudeCodeAgent`` and overrides three things: where
    the setup script template is read from, the environment handed to the
    agent, and the command line (so that ``BENCHMARK_SUFFIX`` is appended to
    every instruction).
    """

    # Identical to LoopalAgent.BENCHMARK_SUFFIX
    BENCHMARK_SUFFIX = (
        "\n\n---\n"
        "IMPORTANT — follow this workflow IN ORDER:\n"
        "1. FIRST, find and read ALL test/verification files (tests/, "
        "test_*.py, run-tests.sh). Extract the exact expected output: "
        "field names, format strings, file paths, config locations, "
        "error codes. These are the ground truth — do not guess.\n"
        "2. Implement the solution based on what the tests expect.\n"
        "3. Verify with objective commands: "
        "`python3 -c \"print(repr(open('f').read()))\"` for file content, "
        "`ls -la` for permissions, `curl` for endpoints. One pass.\n"
        "4. If tests exist, run them. If any fail, read the assertion, "
        "fix, re-run.\n\n"
        "Rules:\n"
        "- Non-interactive. Do NOT use AskUser. Use best judgment.\n"
        "- chmod +x any script files immediately after creation.\n"
        "- Background services: use nohup/tmux so they survive after "
        "your session.\n"
        "- Only modify files directly related to the task. "
        "git diff --name-only to confirm scope.\n"
        "- If blocked, try alternatives instead of giving up.\n"
    )

    @property
    def _install_agent_script_path(self) -> Path:
        """Return the path of the rendered Claude Code setup script.

        The template is read from the original agent's directory rather than
        from this subclass's directory. The rendered script is written to a
        temporary file once per instance and that path is reused on later
        accesses, so repeated reads of this property do not pile up temp
        files. If the cached file has disappeared it is rendered again.
        """
        import tempfile

        from terminal_bench.utils.template_utils import render_setup_script

        cached = getattr(self, "_rendered_setup_script", None)
        if cached is not None and cached.is_file():
            return cached

        template_path = _ORIGINAL_AGENT_DIR / "claude-code-setup.sh.j2"
        script_content = render_setup_script(
            template_path, self._get_template_variables()
        )

        # delete=False: the file must outlive this call because the caller
        # only receives its path. The context manager still guarantees the
        # handle is closed, even if the write fails.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".sh", delete=False
        ) as temp_file:
            temp_file.write(script_content)

        script_path = Path(temp_file.name)
        script_path.chmod(0o755)
        self._rendered_setup_script = script_path
        return script_path

    @staticmethod
    def _dockerize_base_url(base_url: str) -> str:
        """Point localhost/private-IP URLs at ``host.docker.internal``.

        Only a complete host is rewritten: it must be followed by a port,
        path, query, fragment or the end of the string. Names that merely
        start with one of the patterns (``localhost.example.com``,
        ``10.0.0.1.nip.io``) are returned unchanged.
        """
        import re

        return re.sub(
            r"(https?://)(?:localhost|127\.0\.0\.1|192\.168\.\d+\.\d+|"
            r"10\.\d+\.\d+\.\d+|172\.(?:1[6-9]|2\d|3[01])\.\d+\.\d+)"
            r"(?=[:/?#]|$)",
            r"\1host.docker.internal",
            base_url,
        )

    @property
    def _env(self) -> dict[str, str]:
        """Build the environment variables passed to Claude Code.

        Returns:
            A dict with ``ANTHROPIC_API_KEY`` and the background-task flags,
            plus ``ANTHROPIC_BASE_URL`` and ``ANTHROPIC_MODEL`` when they can
            be derived from the host environment or the configured model.

        Raises:
            EnvironmentError: If neither ``ANTHROPIC_API_KEY`` nor
                ``ANTHROPIC_AUTH_TOKEN`` is set on the host.
        """
        api_key = os.environ.get("ANTHROPIC_API_KEY") or os.environ.get(
            "ANTHROPIC_AUTH_TOKEN", ""
        )
        if not api_key:
            raise EnvironmentError(
                "ANTHROPIC_API_KEY or ANTHROPIC_AUTH_TOKEN is required."
            )

        env = {
            "ANTHROPIC_API_KEY": api_key,
            "FORCE_AUTO_BACKGROUND_TASKS": "1",
            "ENABLE_BACKGROUND_TASKS": "1",
        }

        # Route through Docker-accessible proxy. An explicit Docker override
        # wins; otherwise the host URL is rewritten to be reachable from
        # inside the container.
        docker_url = os.environ.get("ANTHROPIC_BASE_URL_DOCKER")
        host_url = os.environ.get("ANTHROPIC_BASE_URL")
        if docker_url:
            env["ANTHROPIC_BASE_URL"] = docker_url
        elif host_url:
            env["ANTHROPIC_BASE_URL"] = self._dockerize_base_url(host_url)

        if self._model_name:
            env["ANTHROPIC_MODEL"] = self._model_name.removeprefix("anthropic/")
        elif "ANTHROPIC_MODEL" in os.environ:
            env["ANTHROPIC_MODEL"] = os.environ["ANTHROPIC_MODEL"]

        return env

    def _run_agent_commands(self, instruction: str) -> list[TerminalCommand]:
        """Return the single ``claude`` command that runs the task.

        Args:
            instruction: Task text; ``BENCHMARK_SUFFIX`` is appended and the
                result is shell-quoted before being placed after ``-p``.

        Returns:
            A one-element list holding a blocking command with no upper
            timeout.
        """
        full_instruction = instruction + self.BENCHMARK_SUFFIX
        escaped_instruction = shlex.quote(full_instruction)
        return [
            TerminalCommand(
                command=f"claude --verbose --output-format stream-json "
                f"-p {escaped_instruction} --allowedTools "
                f"{' '.join(self.ALLOWED_TOOLS)}",
                min_timeout_sec=0.0,
                max_timeout_sec=float("inf"),
                block=True,
                append_enter=True,
            ),
        ]
169 changes: 169 additions & 0 deletions benchmarks/terminal_bench/loopal_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
"""Loopal agent adapter for Terminal-Bench."""

import os
import platform
import shlex
from pathlib import Path

from terminal_bench.agents.installed_agents.abstract_installed_agent import (
AbstractInstalledAgent,
)
from terminal_bench.terminal.models import TerminalCommand


class LoopalAgent(AbstractInstalledAgent):
    """Terminal-Bench adapter that runs Loopal in headless server mode.

    A pre-built Loopal Linux binary is copied into the task container before
    the standard installed-agent flow runs; ``setup.sh`` then installs it.
    """

    # Workflow/rules text appended to every task instruction.
    BENCHMARK_SUFFIX = (
        "\n\n---\n"
        "IMPORTANT — follow this workflow IN ORDER:\n"
        "1. FIRST, find and read ALL test/verification files (tests/, "
        "test_*.py, run-tests.sh). Extract the exact expected output: "
        "field names, format strings, file paths, config locations, "
        "error codes. These are the ground truth — do not guess.\n"
        "2. Implement the solution based on what the tests expect.\n"
        "3. Verify with objective commands: "
        "`python3 -c \"print(repr(open('f').read()))\"` for file content, "
        "`ls -la` for permissions, `curl` for endpoints. One pass.\n"
        "4. If tests exist, run them. If any fail, read the assertion, "
        "fix, re-run.\n\n"
        "Rules:\n"
        "- Non-interactive. Do NOT use AskUser. Use best judgment.\n"
        "- chmod +x any script files immediately after creation.\n"
        "- Background services: use nohup/tmux so they survive after "
        "your session.\n"
        "- Only modify files directly related to the task. "
        "git diff --name-only to confirm scope.\n"
        "- If blocked, try alternatives instead of giving up.\n"
    )

    def __init__(self, model_name: str | None = None, *args, **kwargs):
        """Store the model name and the optional ``binary_path`` kwarg.

        All positional and keyword arguments (including ``binary_path``) are
        also forwarded to the base class unchanged.
        """
        super().__init__(*args, **kwargs)
        self._model_name = model_name
        self._binary_path = kwargs.get("binary_path")

    @staticmethod
    def name() -> str:
        """Return the agent name, ``"loopal"``."""
        return "loopal"

    @staticmethod
    def _dockerize_base_url(base_url: str) -> str:
        """Point localhost/private-IP URLs at ``host.docker.internal``.

        Only a complete host is rewritten: it must be followed by a port,
        path, query, fragment or the end of the string. Names that merely
        start with one of the patterns (``localhost.example.com``,
        ``10.0.0.1.nip.io``) are returned unchanged.
        """
        import re

        return re.sub(
            r"(https?://)(?:localhost|127\.0\.0\.1|192\.168\.\d+\.\d+|"
            r"10\.\d+\.\d+\.\d+|172\.(?:1[6-9]|2\d|3[01])\.\d+\.\d+)"
            r"(?=[:/?#]|$)",
            r"\1host.docker.internal",
            base_url,
        )

    @property
    def _env(self) -> dict[str, str]:
        """Build the environment variables passed to Loopal.

        Returns:
            A dict forwarding whichever of ``ANTHROPIC_API_KEY`` /
            ``ANTHROPIC_AUTH_TOKEN`` are set, plus ``ANTHROPIC_BASE_URL`` and
            ``LOOPAL_MODEL`` when they can be derived from the host
            environment or the configured model.

        Raises:
            EnvironmentError: If neither ``ANTHROPIC_API_KEY`` nor
                ``ANTHROPIC_AUTH_TOKEN`` is set on the host.
        """
        # Loopal checks ANTHROPIC_API_KEY first, then ANTHROPIC_AUTH_TOKEN
        api_key = os.environ.get("ANTHROPIC_API_KEY")
        auth_token = os.environ.get("ANTHROPIC_AUTH_TOKEN")
        if not (api_key or auth_token):
            raise EnvironmentError(
                "ANTHROPIC_API_KEY or ANTHROPIC_AUTH_TOKEN is required."
            )

        env: dict[str, str] = {}
        # Forward whichever key variant is set
        if api_key:
            env["ANTHROPIC_API_KEY"] = api_key
        if auth_token:
            env["ANTHROPIC_AUTH_TOKEN"] = auth_token

        # A Docker-specific override has the highest priority. Otherwise the
        # host URL is rewritten, because Docker containers can't access host
        # LAN IPs directly.
        docker_url = os.environ.get("ANTHROPIC_BASE_URL_DOCKER")
        host_url = os.environ.get("ANTHROPIC_BASE_URL")
        if docker_url:
            env["ANTHROPIC_BASE_URL"] = docker_url
        elif host_url:
            env["ANTHROPIC_BASE_URL"] = self._dockerize_base_url(host_url)

        if self._model_name:
            env["LOOPAL_MODEL"] = self._model_name.removeprefix("anthropic/")
        elif "LOOPAL_MODEL" in os.environ:
            env["LOOPAL_MODEL"] = os.environ["LOOPAL_MODEL"]
        return env

    @property
    def _install_agent_script_path(self) -> Path:
        """Return the path of ``setup.sh`` next to this module."""
        return Path(__file__).parent / "setup.sh"

    def _resolve_binary_path(self) -> Path:
        """Resolve the Loopal Linux binary path for the target architecture.

        An explicit ``binary_path`` is used if given and must exist.
        Otherwise the first existing file among the per-architecture binaries
        in ``bin/`` (host architecture first) and ``bazel-bin/loopal`` in the
        project root is returned.

        Raises:
            FileNotFoundError: If the explicit path is not a file, or no
                candidate exists.
        """
        if self._binary_path:
            explicit = Path(self._binary_path)
            if explicit.is_file():
                return explicit
            raise FileNotFoundError(
                f"Loopal binary not found at: {self._binary_path}"
            )

        adapter_dir = Path(__file__).parent
        project_root = adapter_dir.parent.parent
        bin_dir = adapter_dir / "bin"

        # Detect host architecture to pick the right binary.
        # Docker Desktop on macOS ARM runs ARM64 Linux containers by default.
        if platform.machine() in ("arm64", "aarch64"):
            arch_suffix = "aarch64"
        else:
            arch_suffix = "x86_64"

        # NOTE(review): the later entries can select a binary built for a
        # different architecture or OS than the container -- confirm this
        # fallback is intended.
        candidates = [
            bin_dir / f"loopal-linux-{arch_suffix}",
            bin_dir / "loopal-linux-x86_64",
            bin_dir / "loopal-linux-aarch64",
            project_root / "bazel-bin" / "loopal",
        ]
        for candidate in candidates:
            if candidate.is_file():
                return candidate

        raise FileNotFoundError(
            "Loopal Linux binary not found. Build it first with:\n"
            "  docker build --platform linux/amd64 "
            "-f benchmarks/terminal_bench/Dockerfile.build -t loopal-build .\n"
            "Or pass --agent-kwarg binary_path=/path/to/loopal"
        )

    def perform_task(
        self,
        instruction: str,
        session,
        logging_dir: Path | None = None,
    ):
        """Copy Loopal binary into the container, then run the standard flow.

        Raises:
            FileNotFoundError: If no Loopal binary can be resolved.
        """
        binary_path = self._resolve_binary_path()

        # Copy the pre-built binary into the container before setup.sh runs.
        # Pass Path object (not str) as copy_to_container expects PathLike.
        session.copy_to_container(
            binary_path,
            container_dir="/installed-agent",
            container_filename="loopal",
        )

        return super().perform_task(instruction, session, logging_dir)

    def _run_agent_commands(self, instruction: str) -> list[TerminalCommand]:
        """Return the single ``loopal`` command that runs the task.

        Args:
            instruction: Task text; ``BENCHMARK_SUFFIX`` is appended to it.

        Returns:
            A one-element list holding a blocking command with no upper
            timeout.
        """
        cmd_parts = ["loopal", "--server", "-P", "bypass"]

        if self._model_name:
            cmd_parts += ["-m", self._model_name.removeprefix("anthropic/")]

        cmd_parts.append(instruction + self.BENCHMARK_SUFFIX)

        return [
            TerminalCommand(
                # shlex.join quotes every argument, so neither the model name
                # nor the instruction can be split or interpreted by the
                # shell.
                command=shlex.join(cmd_parts),
                min_timeout_sec=0.0,
                max_timeout_sec=float("inf"),
                block=True,
                append_enter=True,
            ),
        ]
9 changes: 9 additions & 0 deletions benchmarks/terminal_bench/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
set -e

# The adapter has already copied the Loopal binary to /installed-agent/loopal.
# Make it executable and place a copy on PATH.
chmod +x /installed-agent/loopal
cp /installed-agent/loopal /usr/local/bin/loopal

# Smoke-test the installed binary and report the outcome on stdout.
if loopal --help > /dev/null 2>&1; then
    echo "Loopal installed successfully"
else
    echo "INSTALL_FAIL_STATUS"
fi
Loading
Loading