From d5cb321cd0c222f2db3d9c16d6314f721c5bbefc Mon Sep 17 00:00:00 2001 From: hubertpysklo Date: Thu, 5 Mar 2026 21:06:34 +0530 Subject: [PATCH] Example --- examples/langchain_agent_benchmark.ipynb | 448 +++++++++++------------ 1 file changed, 223 insertions(+), 225 deletions(-) diff --git a/examples/langchain_agent_benchmark.ipynb b/examples/langchain_agent_benchmark.ipynb index ceb1857..e217d02 100644 --- a/examples/langchain_agent_benchmark.ipynb +++ b/examples/langchain_agent_benchmark.ipynb @@ -1,228 +1,226 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Agent-Diff Benchmark: LangChain Agent\n", - "\n", - "Run the [Agent-Diff benchmark](https://arxiv.org/abs/2602.11224) using LangChain's built-in agent with tool calling.\n", - "\n", - "Unlike the [ReAct notebook](react_agent_benchmark.ipynb) which uses a custom XML-tag loop, this notebook lets LangChain handle the agent loop via the model's native function-calling protocol.\n", - "\n", - "Two options are shown:\n", - "- **Option A** — Load tests from HuggingFace dataset (no server-side test suites needed)\n", - "- **Option B** — Load tests from Agent-Diff server test suites (used in production evaluations)\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/agent-diff-bench/agent-diff/blob/main/examples/langchain_agent_benchmark.ipynb)\n", - "\n", - "**Links:** [Paper](https://arxiv.org/abs/2602.11224) | [Dataset](https://huggingface.co/datasets/hubertmarek/agent-diff-bench) | [GitHub](https://github.com/agent-diff-bench/agent-diff)" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Agent-Diff Benchmark: LangChain Agent\n", + "\n", + "Run the [Agent-Diff benchmark](https://arxiv.org/abs/2602.11224) using LangChain's agent framework.\n", + "\n", + "Two options below:\n", + "- **Option A** — Load tests from HuggingFace dataset \n", + "- **Option B** — Load tests from Agent-Diff server test suites \n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/agent-diff-bench/agent-diff/blob/main/examples/langchain_agent_benchmark.ipynb)\n", + "\n", + "**Links:** [Paper](https://arxiv.org/abs/2602.11224) | [Dataset](https://huggingface.co/datasets/hubertmarek/agent-diff-bench) | [GitHub](https://github.com/agent-diff-bench/agent-diff) | [Custom Evals Example](https://colab.research.google.com/drive/1cfeMQ2R_JpGRdagT0U-D8cngpsHJmJON?usp=sharing)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install agent-diff langchain langchain-openai datasets -q" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get your API key at https://www.agentdiff.dev/dashboard\n", + "%env AGENT_DIFF_API_KEY=\n", + "%env AGENT_DIFF_BASE_URL=https://api.agentdiff.dev\n", + "# OpenRouter key (or any OpenAI-compatible provider) e.g. https://openrouter.ai/anthropic/claude-haiku-4.5\n", + "%env OPENAI_API_KEY=" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import json\n", + "from agent_diff import AgentDiff, PythonExecutorProxy, create_langchain_tool\n", + "from langchain.agents import create_agent\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "client = AgentDiff()\n", + "\n", + "model = ChatOpenAI(\n", + " model=\"anthropic/claude-haiku-4.5\",\n", + " base_url=\"https://openrouter.ai/api/v1\",\n", + ")\n", + "\n", + "SERVICE_PROMPTS = {\n", + " \"slack\": \"Use execute_python to interact with Slack API at https://slack.com/api. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token.\",\n", + " \"box\": \"Use execute_python to interact with Box API at https://api.box.com/2.0. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token.\",\n", + " \"calendar\": \"Use execute_python to interact with Google Calendar API at https://www.googleapis.com/calendar/v3. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token. Current Date/Time: Sunday, June 17, 2018 at 00:01 (midnight), timezone America/Los_Angeles.\",\n", + " \"linear\": \"Use execute_python to interact with Linear GraphQL API at https://api.linear.app/graphql. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token.\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Option A: Load from HuggingFace Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "dataset = load_dataset(\"hubertmarek/agent-diff-bench\", split=\"test\")\n", + "results = []\n", + "\n", + "for example in dataset.select(range(5)): # First 5 tasks; remove .select() for full benchmark\n", + " info = json.loads(example[\"info\"]) if isinstance(example[\"info\"], str) else example[\"info\"]\n", + " expected = json.loads(example[\"answer\"]) if isinstance(example[\"answer\"], str) else example[\"answer\"]\n", + " service = info[\"service\"]\n", + "\n", + " print(f\"Running: {example.get('test_name', example['test_id'])}\")\n", + "\n", + " env = client.init_env(\n", + " templateService=info[\"service\"],\n", + " templateName=info[\"seed_template\"],\n", + " impersonateUserId=info[\"impersonate_user_id\"],\n", + " )\n", + " run = client.start_run(envId=env.environmentId)\n", + "\n", + " python_tool = create_langchain_tool(\n", + " PythonExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key)\n", + " )\n", + "\n", + " agent = create_agent(\n", + " model=model,\n", + " tools=[python_tool],\n", + " system_prompt=SERVICE_PROMPTS[service],\n", + " )\n", + "\n", + " start = time.perf_counter()\n", + " try:\n", + " response = agent.invoke({\"messages\": [\n", + " {\"role\": \"user\", \"content\": example[\"question\"]}\n", + " ]})\n", + " except Exception as e:\n", + " response = {\"error\": str(e)}\n", + " elapsed = time.perf_counter() - start\n", + "\n", + " client.evaluate_run(runId=run.runId, expectedOutput=expected)\n", + " result = client.get_results_for_run(runId=run.runId)\n", + "\n", + " results.append({\n", + " \"test_id\": example[\"test_id\"],\n", + " \"service\": service,\n", + " \"passed\": result.passed,\n", + " \"score\": result.score,\n", + " \"time\": round(elapsed, 1),\n", + " })\n", + " print(f\" {'PASS' if result.passed else 'FAIL'} | score={result.score} | {elapsed:.1f}s\")\n", + "\n", + " client.delete_env(envId=env.environmentId)\n", + "\n", + "passed = sum(1 for r in results if r[\"passed\"])\n", + "print(f\"\\nResults: {passed}/{len(results)} passed\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Option B: Load from Server Test Suites\n", + "\n", + "Uses the Agent-Diff platform's test suite API. Assertions are defined server-side so you don't need to pass `expectedOutput` — just call `evaluate_run`. Available test suites: [docs](https://agentdiff.mintlify.app/test-suites/benchmarks)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "SUITES = [\"Slack Bench v2\", \"Box Bench v2\", \"Calendar Bench\", \"Linear Bench\"]\n", + "\n", + "results = []\n", + "\n", + "for suite_name in SUITES:\n", + " suite_list = client.list_test_suites(name=suite_name)\n", + " if not suite_list.testSuites:\n", + " print(f\"[SKIP] '{suite_name}' not found\")\n", + " continue\n", + " suite = client.get_test_suite(suite_list.testSuites[0].id, expand=True)\n", + " tests = suite.tests[:5] # First 5 tests per suite; remove [:5] for full benchmark\n", + "\n", + " print(f\"\\n{'='*50}\")\n", + " print(f\" {suite_name} — {len(tests)} tests\")\n", + " print(f\"{'='*50}\")\n", + "\n", + " for test in tests:\n", + " env = client.init_env(testId=test.id)\n", + " run = client.start_run(envId=env.environmentId, testId=test.id)\n", + "\n", + " python_tool = create_langchain_tool(\n", + " PythonExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key)\n", + " )\n", + "\n", + " service = env.service\n", + " agent = create_agent(\n", + " model=model,\n", + " tools=[python_tool],\n", + " system_prompt=SERVICE_PROMPTS.get(service, SERVICE_PROMPTS[\"slack\"]),\n", + " )\n", + "\n", + " start = time.perf_counter()\n", + " try:\n", + " response = agent.invoke({\"messages\": [\n", + " {\"role\": \"user\", \"content\": test.prompt}\n", + " ]})\n", + " except Exception as e:\n", + " response = {\"error\": str(e)}\n", + " elapsed = time.perf_counter() - start\n", + "\n", + " client.evaluate_run(runId=run.runId)\n", + " result = client.get_results_for_run(runId=run.runId)\n", + "\n", + " results.append({\n", + " \"test_id\": str(test.id),\n", + " \"suite\": suite_name,\n", + " \"passed\": result.passed,\n", + " \"score\": result.score,\n", + " \"time\": round(elapsed, 1),\n", + " })\n", + " status = \"PASS\" if result.passed else \"FAIL\"\n", + " print(f\" [{status}] {getattr(test, 'name', str(test.id))[:60]} score={result.score} | {elapsed:.1f}s\")\n", + "\n", + " client.delete_env(envId=env.environmentId)\n", + "\n", + "passed = sum(1 for r in results if r[\"passed\"])\n", + "print(f\"\\nResults: {passed}/{len(results)} passed\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install agent-diff langchain langchain-openai datasets -q" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get your API key at https://www.agentdiff.dev/dashboard\n", - "%env AGENT_DIFF_API_KEY=\n", - "%env AGENT_DIFF_BASE_URL=https://api.agentdiff.dev\n", - "# OpenRouter key (or any OpenAI-compatible provider) e.g. https://openrouter.ai/anthropic/claude-haiku-4.5\n", - "%env OPENAI_API_KEY=" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "import json\n", - "from agent_diff import AgentDiff, PythonExecutorProxy, create_langchain_tool\n", - "from langchain.agents import create_agent\n", - "from langchain_openai import ChatOpenAI\n", - "\n", - "client = AgentDiff()\n", - "\n", - "model = ChatOpenAI(\n", - " model=\"anthropic/claude-haiku-4.5\",\n", - " base_url=\"https://openrouter.ai/api/v1\",\n", - ")\n", - "\n", - "SERVICE_PROMPTS = {\n", - " \"slack\": \"Use execute_python to interact with Slack API at https://slack.com/api. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token.\",\n", - " \"box\": \"Use execute_python to interact with Box API at https://api.box.com/2.0. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token.\",\n", - " \"calendar\": \"Use execute_python to interact with Google Calendar API at https://www.googleapis.com/calendar/v3. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token. Current Date/Time: Sunday, June 17, 2018 at 00:01 (midnight), timezone America/Los_Angeles.\",\n", - " \"linear\": \"Use execute_python to interact with Linear GraphQL API at https://api.linear.app/graphql. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token.\",\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Option A: Load from HuggingFace Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from datasets import load_dataset\n", - "\n", - "dataset = load_dataset(\"hubertmarek/agent-diff-bench\", split=\"test\")\n", - "results = []\n", - "\n", - "for example in dataset.select(range(5)): # First 5 tasks; remove .select() for full benchmark\n", - " info = json.loads(example[\"info\"]) if isinstance(example[\"info\"], str) else example[\"info\"]\n", - " expected = json.loads(example[\"answer\"]) if isinstance(example[\"answer\"], str) else example[\"answer\"]\n", - " service = info[\"service\"]\n", - "\n", - " print(f\"Running: {example.get('test_name', example['test_id'])}\")\n", - "\n", - " env = client.init_env(\n", - " templateService=info[\"service\"],\n", - " templateName=info[\"seed_template\"],\n", - " impersonateUserId=info[\"impersonate_user_id\"],\n", - " )\n", - " run = client.start_run(envId=env.environmentId)\n", - "\n", - " python_tool = create_langchain_tool(\n", - " PythonExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key)\n", - " )\n", - "\n", - " agent = create_agent(\n", - " model=model,\n", - " tools=[python_tool],\n", - " system_prompt=SERVICE_PROMPTS[service],\n", - " )\n", - "\n", - " start = time.perf_counter()\n", - " try:\n", - " response = agent.invoke({\"messages\": [\n", - " {\"role\": \"user\", \"content\": example[\"question\"]}\n", - " ]})\n", - " except Exception as e:\n", - " response = {\"error\": str(e)}\n", - " elapsed = time.perf_counter() - start\n", - "\n", - " client.evaluate_run(runId=run.runId, expectedOutput=expected)\n", - " result = client.get_results_for_run(runId=run.runId)\n", - "\n", - " results.append({\n", - " \"test_id\": example[\"test_id\"],\n", - " \"service\": service,\n", - " \"passed\": result.passed,\n", - " \"score\": result.score,\n", - " \"time\": round(elapsed, 1),\n", - " })\n", - " print(f\" {'PASS' if result.passed else 'FAIL'} | score={result.score} | {elapsed:.1f}s\")\n", - "\n", - " client.delete_env(envId=env.environmentId)\n", - "\n", - "passed = sum(1 for r in results if r[\"passed\"])\n", - "print(f\"\\nResults: {passed}/{len(results)} passed\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Option B: Load from Server Test Suites\n", - "\n", - "Uses the Agent-Diff platform's test suite API. Assertions are defined server-side so you don't need to pass `expectedOutput` — just call `evaluate_run`. Available test suites: [docs](https://agentdiff.mintlify.app/test-suites/benchmarks)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "SUITES = [\"Slack Bench v2\", \"Box Bench v2\", \"Calendar Bench\", \"Linear Bench\"]\n", - "\n", - "results = []\n", - "\n", - "for suite_name in SUITES:\n", - " suite_list = client.list_test_suites(name=suite_name)\n", - " if not suite_list.testSuites:\n", - " print(f\"[SKIP] '{suite_name}' not found\")\n", - " continue\n", - " suite = client.get_test_suite(suite_list.testSuites[0].id, expand=True)\n", - " tests = suite.tests[:5] # First 5 tests per suite; remove [:5] for full benchmark\n", - "\n", - " print(f\"\\n{'='*50}\")\n", - " print(f\" {suite_name} — {len(tests)} tests\")\n", - " print(f\"{'='*50}\")\n", - "\n", - " for test in tests:\n", - " env = client.init_env(testId=test.id)\n", - " run = client.start_run(envId=env.environmentId, testId=test.id)\n", - "\n", - " python_tool = create_langchain_tool(\n", - " PythonExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key)\n", - " )\n", - "\n", - " service = env.service\n", - " agent = create_agent(\n", - " model=model,\n", - " tools=[python_tool],\n", - " system_prompt=SERVICE_PROMPTS.get(service, SERVICE_PROMPTS[\"slack\"]),\n", - " )\n", - "\n", - " start = time.perf_counter()\n", - " try:\n", - " response = agent.invoke({\"messages\": [\n", - " {\"role\": \"user\", \"content\": test.prompt}\n", - " ]})\n", - " except Exception as e:\n", - " response = {\"error\": str(e)}\n", - " elapsed = time.perf_counter() - start\n", - "\n", - " client.evaluate_run(runId=run.runId)\n", - " result = client.get_results_for_run(runId=run.runId)\n", - "\n", - " results.append({\n", - " \"test_id\": str(test.id),\n", - " \"suite\": suite_name,\n", - " \"passed\": result.passed,\n", - " \"score\": result.score,\n", - " \"time\": round(elapsed, 1),\n", - " })\n", - " status = \"PASS\" if result.passed else \"FAIL\"\n", - " print(f\" [{status}] {getattr(test, 'name', str(test.id))[:60]} score={result.score} | {elapsed:.1f}s\")\n", - "\n", - " client.delete_env(envId=env.environmentId)\n", - "\n", - "passed = sum(1 for r in results if r[\"passed\"])\n", - "print(f\"\\nResults: {passed}/{len(results)} passed\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "nbformat": 4, + "nbformat_minor": 4 }