From d5cb321cd0c222f2db3d9c16d6314f721c5bbefc Mon Sep 17 00:00:00 2001
From: hubertpysklo <hubert@uni.minerva.edu>
Date: Thu, 5 Mar 2026 21:06:34 +0530
Subject: [PATCH] examples: re-indent LangChain benchmark notebook and simplify intro

---
 examples/langchain_agent_benchmark.ipynb | 448 +++++++++++------------
 1 file changed, 223 insertions(+), 225 deletions(-)

diff --git a/examples/langchain_agent_benchmark.ipynb b/examples/langchain_agent_benchmark.ipynb
index ceb1857..e217d02 100644
--- a/examples/langchain_agent_benchmark.ipynb
+++ b/examples/langchain_agent_benchmark.ipynb
@@ -1,228 +1,226 @@
 {
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Agent-Diff Benchmark: LangChain Agent\n",
-    "\n",
-    "Run the [Agent-Diff benchmark](https://arxiv.org/abs/2602.11224) using LangChain's built-in agent with tool calling.\n",
-    "\n",
-    "Unlike the [ReAct notebook](react_agent_benchmark.ipynb) which uses a custom XML-tag loop, this notebook lets LangChain handle the agent loop via the model's native function-calling protocol.\n",
-    "\n",
-    "Two options are shown:\n",
-    "- **Option A** — Load tests from HuggingFace dataset (no server-side test suites needed)\n",
-    "- **Option B** — Load tests from Agent-Diff server test suites (used in production evaluations)\n",
-    "\n",
-    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/agent-diff-bench/agent-diff/blob/main/examples/langchain_agent_benchmark.ipynb)\n",
-    "\n",
-    "**Links:** [Paper](https://arxiv.org/abs/2602.11224) | [Dataset](https://huggingface.co/datasets/hubertmarek/agent-diff-bench) | [GitHub](https://github.com/agent-diff-bench/agent-diff)"
-   ]
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Agent-Diff Benchmark: LangChain Agent\n",
+        "\n",
+        "Run the [Agent-Diff benchmark](https://arxiv.org/abs/2602.11224) using LangChain's agent framework.\n",
+        "\n",
+        "Two options below:\n",
+        "- **Option A** — Load tests from the HuggingFace dataset\n",
+        "- **Option B** — Load tests from Agent-Diff server test suites\n",
+        "\n",
+        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/agent-diff-bench/agent-diff/blob/main/examples/langchain_agent_benchmark.ipynb)\n",
+        "\n",
+        "**Links:** [Paper](https://arxiv.org/abs/2602.11224) | [Dataset](https://huggingface.co/datasets/hubertmarek/agent-diff-bench) | [GitHub](https://github.com/agent-diff-bench/agent-diff) | [Custom Evals Example](https://colab.research.google.com/drive/1cfeMQ2R_JpGRdagT0U-D8cngpsHJmJON?usp=sharing)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "%pip install -q agent-diff \"langchain>=1.0\" langchain-openai datasets"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Get your API key at https://www.agentdiff.dev/dashboard\n",
+        "%env AGENT_DIFF_API_KEY=\n",
+        "%env AGENT_DIFF_BASE_URL=https://api.agentdiff.dev\n",
+        "# OpenRouter key (or any OpenAI-compatible provider) e.g. https://openrouter.ai/anthropic/claude-haiku-4.5\n",
+        "%env OPENAI_API_KEY="
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import time\n",
+        "import json\n",
+        "from agent_diff import AgentDiff, PythonExecutorProxy, create_langchain_tool\n",
+        "from langchain.agents import create_agent\n",
+        "from langchain_openai import ChatOpenAI\n",
+        "\n",
+        "client = AgentDiff()\n",
+        "\n",
+        "# OpenRouter is reached through ChatOpenAI by pointing base_url at its OpenAI-compatible endpoint.\n",
+        "model = ChatOpenAI(model=\"anthropic/claude-haiku-4.5\", base_url=\"https://openrouter.ai/api/v1\")\n",
+        "\n",
+        "# System prompt per service, keyed by service name. Each prompt tells the agent to use\n",
+        "# execute_python against that service's API and that the proxy handles authentication.\n",
+        "SERVICE_PROMPTS = {\n",
+        "    \"slack\": \"Use execute_python to interact with Slack API at https://slack.com/api. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token.\",\n",
+        "    \"box\": \"Use execute_python to interact with Box API at https://api.box.com/2.0. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token.\",\n",
+        "    \"calendar\": \"Use execute_python to interact with Google Calendar API at https://www.googleapis.com/calendar/v3. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token. Current Date/Time: Sunday, June 17, 2018 at 00:01 (midnight), timezone America/Los_Angeles.\",\n",
+        "    \"linear\": \"Use execute_python to interact with Linear GraphQL API at https://api.linear.app/graphql. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token.\",\n",
+        "}"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Option A: Load from HuggingFace Dataset"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from datasets import load_dataset\n",
+        "\n",
+        "dataset = load_dataset(\"hubertmarek/agent-diff-bench\", split=\"test\")\n",
+        "results = []  # one dict per task: test_id, service, passed, score, time\n",
+        "# For each task: create an environment, run the agent, evaluate the run, record the result.\n",
+        "for example in dataset.select(range(5)):  # First 5 tasks; remove .select() for full benchmark\n",
+        "    info = json.loads(example[\"info\"]) if isinstance(example[\"info\"], str) else example[\"info\"]\n",
+        "    expected = json.loads(example[\"answer\"]) if isinstance(example[\"answer\"], str) else example[\"answer\"]\n",
+        "    service = info[\"service\"]\n",
+        "\n",
+        "    print(f\"Running: {example.get('test_name', example['test_id'])}\")\n",
+        "\n",
+        "    env = client.init_env(\n",
+        "        templateService=info[\"service\"],\n",
+        "        templateName=info[\"seed_template\"],\n",
+        "        impersonateUserId=info[\"impersonate_user_id\"],\n",
+        "    )\n",
+        "    try:  # the finally below deletes the environment even if a later step raises\n",
+        "        run = client.start_run(envId=env.environmentId)\n",
+        "        python_tool = create_langchain_tool(\n",
+        "            PythonExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key)\n",
+        "        )\n",
+        "\n",
+        "        agent = create_agent(\n",
+        "            model=model,\n",
+        "            tools=[python_tool],\n",
+        "            system_prompt=SERVICE_PROMPTS[service],\n",
+        "        )\n",
+        "\n",
+        "        start = time.perf_counter()\n",
+        "        try:\n",
+        "            response = agent.invoke({\"messages\": [{\"role\": \"user\", \"content\": example[\"question\"]}]})\n",
+        "        except Exception as e:\n",
+        "            # Keep the benchmark going, but report the failure instead of hiding it.\n",
+        "            response = {\"error\": str(e)}\n",
+        "            print(f\"  agent error: {type(e).__name__}: {e}\")\n",
+        "        elapsed = time.perf_counter() - start\n",
+        "\n",
+        "        client.evaluate_run(runId=run.runId, expectedOutput=expected)\n",
+        "        result = client.get_results_for_run(runId=run.runId)\n",
+        "\n",
+        "        results.append({\n",
+        "            \"test_id\": example[\"test_id\"],\n",
+        "            \"service\": service,\n",
+        "            \"passed\": result.passed,\n",
+        "            \"score\": result.score,\n",
+        "            \"time\": round(elapsed, 1),\n",
+        "        })\n",
+        "        print(f\"  {'PASS' if result.passed else 'FAIL'} | score={result.score} | {elapsed:.1f}s\")\n",
+        "    finally:\n",
+        "        client.delete_env(envId=env.environmentId)\n",
+        "\n",
+        "passed = sum(1 for r in results if r[\"passed\"])\n",
+        "print(f\"\\nResults: {passed}/{len(results)} passed\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Option B: Load from Server Test Suites\n",
+        "\n",
+        "Uses the Agent-Diff platform's test suite API. Assertions are defined server-side so you don't need to pass `expectedOutput` — just call `evaluate_run`. Available test suites: [docs](https://agentdiff.mintlify.app/test-suites/benchmarks)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "SUITES = [\"Slack Bench v2\", \"Box Bench v2\", \"Calendar Bench\", \"Linear Bench\"]\n",
+        "\n",
+        "results = []  # one dict per test: test_id, suite, passed, score, time\n",
+        "# For each suite: look it up by name, then run and evaluate the agent on its tests.\n",
+        "for suite_name in SUITES:\n",
+        "    suite_list = client.list_test_suites(name=suite_name)\n",
+        "    if not suite_list.testSuites:\n",
+        "        print(f\"[SKIP] '{suite_name}' not found\")\n",
+        "        continue\n",
+        "    suite = client.get_test_suite(suite_list.testSuites[0].id, expand=True)\n",
+        "    tests = suite.tests[:5]  # First 5 tests per suite; remove [:5] for full benchmark\n",
+        "\n",
+        "    print(f\"\\n{'='*50}\")\n",
+        "    print(f\"  {suite_name} — {len(tests)} tests\")\n",
+        "    print(f\"{'='*50}\")\n",
+        "\n",
+        "    for test in tests:\n",
+        "        env = client.init_env(testId=test.id)\n",
+        "        try:  # the finally below deletes the environment even if a later step raises\n",
+        "            run = client.start_run(envId=env.environmentId, testId=test.id)\n",
+        "            python_tool = create_langchain_tool(\n",
+        "                PythonExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key)\n",
+        "            )\n",
+        "\n",
+        "            service = env.service  # services without an entry fall back to the Slack prompt below\n",
+        "            agent = create_agent(\n",
+        "                model=model,\n",
+        "                tools=[python_tool],\n",
+        "                system_prompt=SERVICE_PROMPTS.get(service, SERVICE_PROMPTS[\"slack\"]),\n",
+        "            )\n",
+        "\n",
+        "            start = time.perf_counter()\n",
+        "            try:\n",
+        "                response = agent.invoke({\"messages\": [{\"role\": \"user\", \"content\": test.prompt}]})\n",
+        "            except Exception as e:\n",
+        "                # Keep the benchmark going, but report the failure instead of hiding it.\n",
+        "                response = {\"error\": str(e)}\n",
+        "                print(f\"  agent error: {type(e).__name__}: {e}\")\n",
+        "            elapsed = time.perf_counter() - start\n",
+        "\n",
+        "            client.evaluate_run(runId=run.runId)\n",
+        "            result = client.get_results_for_run(runId=run.runId)\n",
+        "\n",
+        "            results.append({\n",
+        "                \"test_id\": str(test.id),\n",
+        "                \"suite\": suite_name,\n",
+        "                \"passed\": result.passed,\n",
+        "                \"score\": result.score,\n",
+        "                \"time\": round(elapsed, 1),\n",
+        "            })\n",
+        "            status = \"PASS\" if result.passed else \"FAIL\"\n",
+        "            print(f\"  [{status}] {getattr(test, 'name', str(test.id))[:60]}  score={result.score} | {elapsed:.1f}s\")\n",
+        "        finally:\n",
+        "            client.delete_env(envId=env.environmentId)\n",
+        "\n",
+        "passed = sum(1 for r in results if r[\"passed\"])\n",
+        "print(f\"\\nResults: {passed}/{len(results)} passed\")"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python",
+      "version": "3.11.0"
+    }
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!pip install agent-diff langchain langchain-openai datasets -q"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get your API key at https://www.agentdiff.dev/dashboard\n",
-    "%env AGENT_DIFF_API_KEY=\n",
-    "%env AGENT_DIFF_BASE_URL=https://api.agentdiff.dev\n",
-    "# OpenRouter key (or any OpenAI-compatible provider) e.g. https://openrouter.ai/anthropic/claude-haiku-4.5\n",
-    "%env OPENAI_API_KEY="
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import time\n",
-    "import json\n",
-    "from agent_diff import AgentDiff, PythonExecutorProxy, create_langchain_tool\n",
-    "from langchain.agents import create_agent\n",
-    "from langchain_openai import ChatOpenAI\n",
-    "\n",
-    "client = AgentDiff()\n",
-    "\n",
-    "model = ChatOpenAI(\n",
-    "    model=\"anthropic/claude-haiku-4.5\",\n",
-    "    base_url=\"https://openrouter.ai/api/v1\",\n",
-    ")\n",
-    "\n",
-    "SERVICE_PROMPTS = {\n",
-    "    \"slack\": \"Use execute_python to interact with Slack API at https://slack.com/api. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token.\",\n",
-    "    \"box\": \"Use execute_python to interact with Box API at https://api.box.com/2.0. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token.\",\n",
-    "    \"calendar\": \"Use execute_python to interact with Google Calendar API at https://www.googleapis.com/calendar/v3. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token. Current Date/Time: Sunday, June 17, 2018 at 00:01 (midnight), timezone America/Los_Angeles.\",\n",
-    "    \"linear\": \"Use execute_python to interact with Linear GraphQL API at https://api.linear.app/graphql. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token.\",\n",
-    "}"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Option A: Load from HuggingFace Dataset"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from datasets import load_dataset\n",
-    "\n",
-    "dataset = load_dataset(\"hubertmarek/agent-diff-bench\", split=\"test\")\n",
-    "results = []\n",
-    "\n",
-    "for example in dataset.select(range(5)):  # First 5 tasks; remove .select() for full benchmark\n",
-    "    info = json.loads(example[\"info\"]) if isinstance(example[\"info\"], str) else example[\"info\"]\n",
-    "    expected = json.loads(example[\"answer\"]) if isinstance(example[\"answer\"], str) else example[\"answer\"]\n",
-    "    service = info[\"service\"]\n",
-    "\n",
-    "    print(f\"Running: {example.get('test_name', example['test_id'])}\")\n",
-    "\n",
-    "    env = client.init_env(\n",
-    "        templateService=info[\"service\"],\n",
-    "        templateName=info[\"seed_template\"],\n",
-    "        impersonateUserId=info[\"impersonate_user_id\"],\n",
-    "    )\n",
-    "    run = client.start_run(envId=env.environmentId)\n",
-    "\n",
-    "    python_tool = create_langchain_tool(\n",
-    "        PythonExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key)\n",
-    "    )\n",
-    "\n",
-    "    agent = create_agent(\n",
-    "        model=model,\n",
-    "        tools=[python_tool],\n",
-    "        system_prompt=SERVICE_PROMPTS[service],\n",
-    "    )\n",
-    "\n",
-    "    start = time.perf_counter()\n",
-    "    try:\n",
-    "        response = agent.invoke({\"messages\": [\n",
-    "            {\"role\": \"user\", \"content\": example[\"question\"]}\n",
-    "        ]})\n",
-    "    except Exception as e:\n",
-    "        response = {\"error\": str(e)}\n",
-    "    elapsed = time.perf_counter() - start\n",
-    "\n",
-    "    client.evaluate_run(runId=run.runId, expectedOutput=expected)\n",
-    "    result = client.get_results_for_run(runId=run.runId)\n",
-    "\n",
-    "    results.append({\n",
-    "        \"test_id\": example[\"test_id\"],\n",
-    "        \"service\": service,\n",
-    "        \"passed\": result.passed,\n",
-    "        \"score\": result.score,\n",
-    "        \"time\": round(elapsed, 1),\n",
-    "    })\n",
-    "    print(f\"  {'PASS' if result.passed else 'FAIL'} | score={result.score} | {elapsed:.1f}s\")\n",
-    "\n",
-    "    client.delete_env(envId=env.environmentId)\n",
-    "\n",
-    "passed = sum(1 for r in results if r[\"passed\"])\n",
-    "print(f\"\\nResults: {passed}/{len(results)} passed\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Option B: Load from Server Test Suites\n",
-    "\n",
-    "Uses the Agent-Diff platform's test suite API. Assertions are defined server-side so you don't need to pass `expectedOutput` — just call `evaluate_run`. Available test suites: [docs](https://agentdiff.mintlify.app/test-suites/benchmarks)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "SUITES = [\"Slack Bench v2\", \"Box Bench v2\", \"Calendar Bench\", \"Linear Bench\"]\n",
-    "\n",
-    "results = []\n",
-    "\n",
-    "for suite_name in SUITES:\n",
-    "    suite_list = client.list_test_suites(name=suite_name)\n",
-    "    if not suite_list.testSuites:\n",
-    "        print(f\"[SKIP] '{suite_name}' not found\")\n",
-    "        continue\n",
-    "    suite = client.get_test_suite(suite_list.testSuites[0].id, expand=True)\n",
-    "    tests = suite.tests[:5]  # First 5 tests per suite; remove [:5] for full benchmark\n",
-    "\n",
-    "    print(f\"\\n{'='*50}\")\n",
-    "    print(f\"  {suite_name} — {len(tests)} tests\")\n",
-    "    print(f\"{'='*50}\")\n",
-    "\n",
-    "    for test in tests:\n",
-    "        env = client.init_env(testId=test.id)\n",
-    "        run = client.start_run(envId=env.environmentId, testId=test.id)\n",
-    "\n",
-    "        python_tool = create_langchain_tool(\n",
-    "            PythonExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key)\n",
-    "        )\n",
-    "\n",
-    "        service = env.service\n",
-    "        agent = create_agent(\n",
-    "            model=model,\n",
-    "            tools=[python_tool],\n",
-    "            system_prompt=SERVICE_PROMPTS.get(service, SERVICE_PROMPTS[\"slack\"]),\n",
-    "        )\n",
-    "\n",
-    "        start = time.perf_counter()\n",
-    "        try:\n",
-    "            response = agent.invoke({\"messages\": [\n",
-    "                {\"role\": \"user\", \"content\": test.prompt}\n",
-    "            ]})\n",
-    "        except Exception as e:\n",
-    "            response = {\"error\": str(e)}\n",
-    "        elapsed = time.perf_counter() - start\n",
-    "\n",
-    "        client.evaluate_run(runId=run.runId)\n",
-    "        result = client.get_results_for_run(runId=run.runId)\n",
-    "\n",
-    "        results.append({\n",
-    "            \"test_id\": str(test.id),\n",
-    "            \"suite\": suite_name,\n",
-    "            \"passed\": result.passed,\n",
-    "            \"score\": result.score,\n",
-    "            \"time\": round(elapsed, 1),\n",
-    "        })\n",
-    "        status = \"PASS\" if result.passed else \"FAIL\"\n",
-    "        print(f\"  [{status}] {getattr(test, 'name', str(test.id))[:60]}  score={result.score} | {elapsed:.1f}s\")\n",
-    "\n",
-    "        client.delete_env(envId=env.environmentId)\n",
-    "\n",
-    "passed = sum(1 for r in results if r[\"passed\"])\n",
-    "print(f\"\\nResults: {passed}/{len(results)} passed\")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "name": "python",
-   "version": "3.11.0"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
+  "nbformat": 4,
+  "nbformat_minor": 4
 }