AgentsMesh · yishuiliunian · Apr 10, 2026 · Apr 10, 2026 · Apr 10, 2026 · Apr 10, 2026
diff --git a/.gitignore b/.gitignore
@@ -20,3 +20,6 @@
 logs/
 .claude/scheduled_tasks.lock
 .loopal/worktrees/
+runs/
+**/__pycache__/
+benchmarks/terminal_bench/bin/
diff --git a/BUILD.bazel b/BUILD.bazel
@@ -16,6 +16,7 @@ rust_binary(
         "//crates/loopal-protocol",
         "//crates/loopal-tool-api",
         "//crates/loopal-config",
+        "//crates/loopal-telemetry",
         "//crates/loopal-runtime",
         "//crates/loopal-storage",
         "//crates/loopal-session",

diff --git a/MODULE.bazel b/MODULE.bazel
@@ -100,8 +100,13 @@ crate.spec(package = "indexmap", version = "2")
 crate.spec(package = "uuid", version = "1", features = ["v4", "serde"])
 crate.spec(package = "chrono", version = "0.4", features = ["serde"])
 crate.spec(package = "tracing", version = "0.1")
-crate.spec(package = "tracing-subscriber", version = "0.3", features = ["env-filter"])
+crate.spec(package = "tracing-subscriber", version = "0.3", features = ["env-filter", "registry"])
 crate.spec(package = "tracing-appender", version = "0.2")
+crate.spec(package = "opentelemetry", version = "0.28", features = ["trace", "metrics", "logs"])
+crate.spec(package = "opentelemetry_sdk", version = "0.28", features = ["trace", "metrics", "logs", "rt-tokio"])
+crate.spec(package = "opentelemetry-otlp", version = "0.28", features = ["trace", "metrics", "logs", "grpc-tonic"])
+crate.spec(package = "tracing-opentelemetry", version = "0.29")
+crate.spec(package = "opentelemetry-appender-tracing", version = "0.28")
 crate.spec(package = "unicode-width", version = "0.2")
 crate.spec(package = "textwrap", version = "0.16")
 crate.spec(package = "pulldown-cmark", version = "0.12", default_features = False)

diff --git a/MODULE.bazel.lock b/MODULE.bazel.lock
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
diff --git a/benchmarks/terminal_bench/Dockerfile.build b/benchmarks/terminal_bench/Dockerfile.build
@@ -0,0 +1,12 @@
+# Builder image: compiles an optimized linux/amd64 loopal binary with Bazel 8.1.0.
+FROM --platform=linux/amd64 gcr.io/bazel-public/bazel:8.1.0
+# apt-get needs root to install the OpenSSL headers and pkg-config.
+USER root
+RUN apt-get update -qq && apt-get install -y -qq libssl-dev pkg-config && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+COPY . /workspace/
+# "-c opt" selects Bazel's optimized compilation mode.
+RUN bazel build //:loopal -c opt
+# Copy binary to a known location
+RUN cp bazel-bin/loopal /usr/local/bin/loopal && chmod +x /usr/local/bin/loopal
diff --git a/benchmarks/terminal_bench/Dockerfile.build-arm64 b/benchmarks/terminal_bench/Dockerfile.build-arm64
@@ -0,0 +1,17 @@
+# Builder image: compiles an optimized linux/arm64 loopal binary with Bazel.
+FROM --platform=linux/arm64 debian:bookworm
+
+RUN apt-get update && apt-get install -y \
+    curl git libssl-dev pkg-config build-essential \
+    python3 python3-pip zip unzip openjdk-17-jdk-headless \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Bazelisk (auto-downloads correct Bazel version). -f makes curl fail on
+# HTTP errors instead of saving an error page as the "bazel" executable.
+RUN curl -fL -o /usr/local/bin/bazel https://github.com/bazelbuild/bazelisk/releases/latest/download/bazelisk-linux-arm64 \
+    && chmod +x /usr/local/bin/bazel
+
+WORKDIR /workspace
+COPY . /workspace/
+RUN bazel build //:loopal -c opt
+RUN cp bazel-bin/loopal /usr/local/bin/loopal && chmod +x /usr/local/bin/loopal
diff --git a/benchmarks/terminal_bench/__init__.py b/benchmarks/terminal_bench/__init__.py
diff --git a/benchmarks/terminal_bench/claude_code_compare_agent.py b/benchmarks/terminal_bench/claude_code_compare_agent.py
@@ -0,0 +1,157 @@
+"""Claude Code comparison adapter for Terminal-Bench.
+
+Same BENCHMARK_SUFFIX as Loopal adapter, routed through the same API proxy,
+so the only variable is the agent harness itself (Claude Code vs Loopal).
+"""
+
+import inspect
+import os
+import re
+import shlex
+import tempfile
+from pathlib import Path
+
+from terminal_bench.agents.installed_agents.claude_code.claude_code_agent import (
+    ClaudeCodeAgent,
+)
+from terminal_bench.terminal.models import TerminalCommand
+
+# Resolve the directory of the ORIGINAL ClaudeCodeAgent class.
+# _get_templated_script_path uses inspect.getfile(self.__class__) which would
+# resolve to THIS file (the subclass). We need to point to the original.
+_ORIGINAL_AGENT_DIR = Path(inspect.getfile(ClaudeCodeAgent)).parent
+
+# Scheme followed by a loopback or private-range IPv4 host. The trailing
+# lookahead requires the host to end right after the match, so a name such as
+# "localhost.example.com" is not rewritten by mistake.
+_HOST_LOCAL_URL_RE = re.compile(
+    r"(https?://)(?:localhost|127\.0\.0\.1|192\.168\.\d+\.\d+|"
+    r"10\.\d+\.\d+\.\d+|172\.(?:1[6-9]|2\d|3[01])\.\d+\.\d+)"
+    r"(?=[:/?#]|$)"
+)
+
+
+class ClaudeCodeCompareAgent(ClaudeCodeAgent):
+    """Claude Code with same BENCHMARK_SUFFIX and API proxy as Loopal.
+
+    Overrides the setup-script lookup, the container environment and the
+    launch command of ClaudeCodeAgent; everything else is inherited.
+    """
+
+    # Identical to LoopalAgent.BENCHMARK_SUFFIX
+    BENCHMARK_SUFFIX = (
+        "\n\n---\n"
+        "IMPORTANT — follow this workflow IN ORDER:\n"
+        "1. FIRST, find and read ALL test/verification files (tests/, "
+        "test_*.py, run-tests.sh). Extract the exact expected output: "
+        "field names, format strings, file paths, config locations, "
+        "error codes. These are the ground truth — do not guess.\n"
+        "2. Implement the solution based on what the tests expect.\n"
+        "3. Verify with objective commands: "
+        "`python3 -c \"print(repr(open('f').read()))\"` for file content, "
+        "`ls -la` for permissions, `curl` for endpoints. One pass.\n"
+        "4. If tests exist, run them. If any fail, read the assertion, "
+        "fix, re-run.\n\n"
+        "Rules:\n"
+        "- Non-interactive. Do NOT use AskUser. Use best judgment.\n"
+        "- chmod +x any script files immediately after creation.\n"
+        "- Background services: use nohup/tmux so they survive after "
+        "your session.\n"
+        "- Only modify files directly related to the task. "
+        "git diff --name-only to confirm scope.\n"
+        "- If blocked, try alternatives instead of giving up.\n"
+    )
+
+    @property
+    def _install_agent_script_path(self) -> Path:
+        """Point to the original Claude Code setup template, not our subclass dir.
+
+        The template is rendered into an executable temporary script once per
+        instance; later accesses reuse that file instead of creating (and
+        leaking) a new one on every read of the property.
+
+        Returns:
+            Path of the rendered setup script on the host.
+        """
+        from terminal_bench.utils.template_utils import render_setup_script
+
+        cached = getattr(self, "_rendered_setup_script", None)
+        if cached is not None and cached.is_file():
+            return cached
+
+        template_path = _ORIGINAL_AGENT_DIR / "claude-code-setup.sh.j2"
+        variables = self._get_template_variables()
+        script_content = render_setup_script(template_path, variables)
+
+        # delete=False: the returned path has to outlive this call.
+        with tempfile.NamedTemporaryFile(
+            mode="w", suffix=".sh", delete=False, encoding="utf-8"
+        ) as temp_file:
+            temp_file.write(script_content)
+        os.chmod(temp_file.name, 0o755)
+
+        self._rendered_setup_script = Path(temp_file.name)
+        return self._rendered_setup_script
+
+    @property
+    def _env(self) -> dict[str, str]:
+        """Build the environment variables for the Claude Code run.
+
+        The API key falls back to ANTHROPIC_AUTH_TOKEN. The base URL comes
+        from ANTHROPIC_BASE_URL_DOCKER when set; otherwise ANTHROPIC_BASE_URL
+        is used with loopback/private hosts replaced by host.docker.internal.
+        The model is self._model_name (without the "anthropic/" prefix) or,
+        failing that, ANTHROPIC_MODEL from the host environment.
+
+        Raises:
+            EnvironmentError: if neither ANTHROPIC_API_KEY nor
+                ANTHROPIC_AUTH_TOKEN is set.
+        """
+        api_key = os.environ.get("ANTHROPIC_API_KEY") or os.environ.get(
+            "ANTHROPIC_AUTH_TOKEN", ""
+        )
+        if not api_key:
+            raise EnvironmentError(
+                "ANTHROPIC_API_KEY or ANTHROPIC_AUTH_TOKEN is required."
+            )
+
+        env = {
+            "ANTHROPIC_API_KEY": api_key,
+            "FORCE_AUTO_BACKGROUND_TASKS": "1",
+            "ENABLE_BACKGROUND_TASKS": "1",
+        }
+
+        # Route through Docker-accessible proxy
+        if os.environ.get("ANTHROPIC_BASE_URL_DOCKER"):
+            env["ANTHROPIC_BASE_URL"] = os.environ["ANTHROPIC_BASE_URL_DOCKER"]
+        elif os.environ.get("ANTHROPIC_BASE_URL"):
+            env["ANTHROPIC_BASE_URL"] = _HOST_LOCAL_URL_RE.sub(
+                r"\1host.docker.internal", os.environ["ANTHROPIC_BASE_URL"]
+            )
+
+        if self._model_name:
+            env["ANTHROPIC_MODEL"] = self._model_name.removeprefix("anthropic/")
+        elif "ANTHROPIC_MODEL" in os.environ:
+            env["ANTHROPIC_MODEL"] = os.environ["ANTHROPIC_MODEL"]
+
+        return env
+
+    def _run_agent_commands(self, instruction: str) -> list[TerminalCommand]:
+        """Return the single blocking command that runs Claude Code.
+
+        BENCHMARK_SUFFIX is appended to *instruction* and the result is
+        shell-quoted before being passed to `claude -p`.
+        """
+        full_instruction = instruction + self.BENCHMARK_SUFFIX
+        escaped_instruction = shlex.quote(full_instruction)
+        return [
+            TerminalCommand(
+                command=f"claude --verbose --output-format stream-json "
+                f"-p {escaped_instruction} --allowedTools "
+                f"{' '.join(self.ALLOWED_TOOLS)}",
+                min_timeout_sec=0.0,
+                max_timeout_sec=float("inf"),
+                block=True,
+                append_enter=True,
+            ),
+        ]
diff --git a/benchmarks/terminal_bench/loopal_agent.py b/benchmarks/terminal_bench/loopal_agent.py
@@ -0,0 +1,203 @@
+"""Loopal agent adapter for Terminal-Bench."""
+
+import os
+import platform
+import re
+import shlex
+from pathlib import Path
+
+from terminal_bench.agents.installed_agents.abstract_installed_agent import (
+    AbstractInstalledAgent,
+)
+from terminal_bench.terminal.models import TerminalCommand
+
+# Scheme followed by a loopback or private-range IPv4 host. The trailing
+# lookahead requires the host to end right after the match, so a name such as
+# "localhost.example.com" is not rewritten by mistake.
+_HOST_LOCAL_URL_RE = re.compile(
+    r"(https?://)(?:localhost|127\.0\.0\.1|192\.168\.\d+\.\d+|"
+    r"10\.\d+\.\d+\.\d+|172\.(?:1[6-9]|2\d|3[01])\.\d+\.\d+)"
+    r"(?=[:/?#]|$)"
+)
+
+
+class LoopalAgent(AbstractInstalledAgent):
+    """Terminal-Bench adapter that runs Loopal in headless server mode."""
+
+    # Appended to every task instruction before it is handed to Loopal.
+    BENCHMARK_SUFFIX = (
+        "\n\n---\n"
+        "IMPORTANT — follow this workflow IN ORDER:\n"
+        "1. FIRST, find and read ALL test/verification files (tests/, "
+        "test_*.py, run-tests.sh). Extract the exact expected output: "
+        "field names, format strings, file paths, config locations, "
+        "error codes. These are the ground truth — do not guess.\n"
+        "2. Implement the solution based on what the tests expect.\n"
+        "3. Verify with objective commands: "
+        "`python3 -c \"print(repr(open('f').read()))\"` for file content, "
+        "`ls -la` for permissions, `curl` for endpoints. One pass.\n"
+        "4. If tests exist, run them. If any fail, read the assertion, "
+        "fix, re-run.\n\n"
+        "Rules:\n"
+        "- Non-interactive. Do NOT use AskUser. Use best judgment.\n"
+        "- chmod +x any script files immediately after creation.\n"
+        "- Background services: use nohup/tmux so they survive after "
+        "your session.\n"
+        "- Only modify files directly related to the task. "
+        "git diff --name-only to confirm scope.\n"
+        "- If blocked, try alternatives instead of giving up.\n"
+    )
+
+    def __init__(self, model_name: str | None = None, *args, **kwargs):
+        """Store the model name and the optional ``binary_path`` keyword.
+
+        Args:
+            model_name: Model identifier; a leading "anthropic/" is stripped
+                wherever the name is used.
+            *args: Forwarded to the base class.
+            **kwargs: Forwarded to the base class unchanged. ``binary_path``,
+                when present, names an explicit Loopal binary to install.
+        """
+        super().__init__(*args, **kwargs)
+        self._model_name = model_name
+        self._binary_path = kwargs.get("binary_path")
+
+    @staticmethod
+    def name() -> str:
+        """Return the agent identifier, ``"loopal"``."""
+        return "loopal"
+
+    @property
+    def _env(self) -> dict[str, str]:
+        """Build the environment variables for Loopal.
+
+        Forwards the Anthropic credentials that are set, a container-reachable
+        ANTHROPIC_BASE_URL and, when known, LOOPAL_MODEL.
+
+        Raises:
+            EnvironmentError: if neither ANTHROPIC_API_KEY nor
+                ANTHROPIC_AUTH_TOKEN is set.
+        """
+        # Loopal checks ANTHROPIC_API_KEY first, then ANTHROPIC_AUTH_TOKEN;
+        # forward whichever variants are set.
+        env = {
+            key: os.environ[key]
+            for key in ("ANTHROPIC_API_KEY", "ANTHROPIC_AUTH_TOKEN")
+            if os.environ.get(key)
+        }
+        if not env:
+            raise EnvironmentError(
+                "ANTHROPIC_API_KEY or ANTHROPIC_AUTH_TOKEN is required."
+            )
+
+        # Docker containers can't access host LAN IPs directly, so private and
+        # localhost hosts are replaced with host.docker.internal. A
+        # Docker-specific override has the highest priority.
+        docker_url = os.environ.get("ANTHROPIC_BASE_URL_DOCKER")
+        base_url = os.environ.get("ANTHROPIC_BASE_URL")
+        if docker_url:
+            env["ANTHROPIC_BASE_URL"] = docker_url
+        elif base_url:
+            env["ANTHROPIC_BASE_URL"] = _HOST_LOCAL_URL_RE.sub(
+                r"\1host.docker.internal", base_url
+            )
+
+        if self._model_name:
+            env["LOOPAL_MODEL"] = self._model_name.removeprefix("anthropic/")
+        elif "LOOPAL_MODEL" in os.environ:
+            env["LOOPAL_MODEL"] = os.environ["LOOPAL_MODEL"]
+        return env
+
+    @property
+    def _install_agent_script_path(self) -> Path:
+        """Return the path of ``setup.sh`` located next to this module."""
+        return Path(__file__).parent / "setup.sh"
+
+    def _resolve_binary_path(self) -> Path:
+        """Resolve the Loopal Linux binary path for the target architecture.
+
+        An explicit ``binary_path`` wins and must exist. Otherwise the first
+        existing candidate is used: the ``bin/`` binary matching the host
+        architecture, then the other ``bin/`` binaries, then
+        ``bazel-bin/loopal`` at the project root.
+
+        Raises:
+            FileNotFoundError: if the explicit path or every candidate is missing.
+        """
+        if self._binary_path:
+            explicit = Path(self._binary_path)
+            if explicit.is_file():
+                return explicit
+            raise FileNotFoundError(
+                f"Loopal binary not found at: {self._binary_path}"
+            )
+
+        adapter_dir = Path(__file__).parent
+        project_root = adapter_dir.parent.parent
+        bin_dir = adapter_dir / "bin"
+
+        # Detect host architecture to pick the right binary.
+        # Docker Desktop on macOS ARM runs ARM64 Linux containers by default.
+        is_arm = platform.machine() in ("arm64", "aarch64")
+        arch_suffix = "aarch64" if is_arm else "x86_64"
+
+        candidates = (
+            bin_dir / f"loopal-linux-{arch_suffix}",
+            bin_dir / "loopal-linux-x86_64",
+            bin_dir / "loopal-linux-aarch64",
+            project_root / "bazel-bin" / "loopal",
+        )
+        for candidate in candidates:
+            if candidate.is_file():
+                return candidate
+
+        raise FileNotFoundError(
+            "Loopal Linux binary not found. Build it first with:\n"
+            "  docker build --platform linux/amd64 "
+            "-f benchmarks/terminal_bench/Dockerfile.build -t loopal-build .\n"
+            "Or pass --agent-kwarg binary_path=/path/to/loopal"
+        )
+
+    def perform_task(
+        self,
+        instruction: str,
+        session,
+        logging_dir: Path | None = None,
+    ):
+        """Copy Loopal binary into the container, then run the standard flow.
+
+        Raises:
+            FileNotFoundError: if no Loopal binary can be resolved.
+        """
+        binary_path = self._resolve_binary_path()
+
+        # Copy the pre-built binary into the container before setup.sh runs.
+        # Pass Path object (not str) as copy_to_container expects PathLike.
+        session.copy_to_container(
+            binary_path,
+            container_dir="/installed-agent",
+            container_filename="loopal",
+        )
+
+        return super().perform_task(instruction, session, logging_dir)
+
+    def _run_agent_commands(self, instruction: str) -> list[TerminalCommand]:
+        """Return the single blocking command that runs Loopal on the task.
+
+        Every argument, including the model name, is shell-quoted so an
+        unusual value cannot be split or interpreted by the shell.
+        """
+        cmd_parts = ["loopal", "--server", "-P", "bypass"]
+        if self._model_name:
+            cmd_parts += ["-m", self._model_name.removeprefix("anthropic/")]
+        cmd_parts.append(instruction + self.BENCHMARK_SUFFIX)
+
+        return [
+            TerminalCommand(
+                command=shlex.join(cmd_parts),
+                min_timeout_sec=0.0,
+                max_timeout_sec=float("inf"),
+                block=True,
+                append_enter=True,
+            ),
+        ]
diff --git a/benchmarks/terminal_bench/setup.sh b/benchmarks/terminal_bench/setup.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Installs the pre-copied Loopal binary onto PATH; stops at the first failing command.
+set -e
+
+# Loopal binary is pre-copied to /installed-agent/loopal by the adapter.
+chmod +x /installed-agent/loopal
+cp /installed-agent/loopal /usr/local/bin/loopal
+# Verify installation: prints INSTALL_FAIL_STATUS instead of aborting when loopal cannot run.
+loopal --help > /dev/null 2>&1 && echo "Loopal installed successfully" || echo "INSTALL_FAIL_STATUS"