diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index f98d402d6..bc7b85d21 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -10,11 +10,21 @@ targets: system_prompt: "Answer directly based on the information provided." grader_target: gemini-flash + - name: pi-cli + provider: pi-cli + grader_target: gemini-flash + + - name: pi-coding-agent + provider: pi-coding-agent + subprovider: openrouter + model: z-ai/glm-4.7 + api_key: ${{ OPENROUTER_API_KEY }} + system_prompt: "Answer directly based on the information provided." + grader_target: gemini-flash + - name: codex provider: codex grader_target: gemini-llm - cwd: ${{ CODEX_WORKSPACE_DIR }} # Where scratch workspaces are created - log_dir: ${{ CODEX_LOG_DIR }} # Optional: where Codex CLI stream logs are stored (defaults to ./.agentv/logs/codex) log_format: json # Optional: 'summary' (default) or 'json' for raw event logs - name: gemini-llm diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 7952bbb4d..ce5e1f5c4 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -16,6 +16,11 @@ "name": "agentv-claude-trace", "description": "Session tracing plugin — exports Claude Code session traces via OpenTelemetry", "source": "./plugins/agentv-claude-trace" + }, + { + "name": "agentic-architect", + "description": "Design and review AI agent architectures — species selection, workflow patterns, and plugin quality review", + "source": "./plugins/agentic-architect" } ] } diff --git a/.claude/settings.json b/.claude/settings.json deleted file mode 100644 index 5cfa58548..000000000 --- a/.claude/settings.json +++ /dev/null @@ -1,84 +0,0 @@ -{ - "hooks": { - "PostToolUse": [ - { - "matcher": "Task", - "hooks": [ - { - "type": "command", - "command": "entire hooks claude-code post-task" - } - ] - }, - { - "matcher": "TodoWrite", - "hooks": [ - { - "type": "command", - "command": "entire hooks claude-code post-todo" - } - ] - } - ], - "PreToolUse": [ 
- { - "matcher": "Task", - "hooks": [ - { - "type": "command", - "command": "entire hooks claude-code pre-task" - } - ] - } - ], - "SessionEnd": [ - { - "matcher": "", - "hooks": [ - { - "type": "command", - "command": "entire hooks claude-code session-end" - } - ] - } - ], - "SessionStart": [ - { - "matcher": "", - "hooks": [ - { - "type": "command", - "command": "entire hooks claude-code session-start" - } - ] - } - ], - "Stop": [ - { - "matcher": "", - "hooks": [ - { - "type": "command", - "command": "entire hooks claude-code stop" - } - ] - } - ], - "UserPromptSubmit": [ - { - "matcher": "", - "hooks": [ - { - "type": "command", - "command": "entire hooks claude-code user-prompt-submit" - } - ] - } - ] - }, - "permissions": { - "deny": [ - "Read(./.entire/metadata/**)" - ] - } -} diff --git a/.entire/.gitignore b/.entire/.gitignore deleted file mode 100644 index 2cffdefad..000000000 --- a/.entire/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -tmp/ -settings.local.json -metadata/ -logs/ diff --git a/.entire/settings.json b/.entire/settings.json deleted file mode 100644 index 1e4f9e0d9..000000000 --- a/.entire/settings.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "enabled": true, - "telemetry": false -} diff --git a/.github/hooks/entire.json b/.github/hooks/entire.json deleted file mode 100644 index f21b0abc5..000000000 --- a/.github/hooks/entire.json +++ /dev/null @@ -1,61 +0,0 @@ -{ - "hooks": { - "agentStop": [ - { - "type": "command", - "bash": "entire hooks copilot-cli agent-stop", - "comment": "Entire CLI" - } - ], - "errorOccurred": [ - { - "type": "command", - "bash": "entire hooks copilot-cli error-occurred", - "comment": "Entire CLI" - } - ], - "postToolUse": [ - { - "type": "command", - "bash": "entire hooks copilot-cli post-tool-use", - "comment": "Entire CLI" - } - ], - "preToolUse": [ - { - "type": "command", - "bash": "entire hooks copilot-cli pre-tool-use", - "comment": "Entire CLI" - } - ], - "sessionEnd": [ - { - "type": "command", - "bash": "entire hooks 
copilot-cli session-end", - "comment": "Entire CLI" - } - ], - "sessionStart": [ - { - "type": "command", - "bash": "entire hooks copilot-cli session-start", - "comment": "Entire CLI" - } - ], - "subagentStop": [ - { - "type": "command", - "bash": "entire hooks copilot-cli subagent-stop", - "comment": "Entire CLI" - } - ], - "userPromptSubmitted": [ - { - "type": "command", - "bash": "entire hooks copilot-cli user-prompt-submitted", - "comment": "Entire CLI" - } - ] - }, - "version": 1 -} diff --git a/.github/plugin/marketplace.json b/.github/plugin/marketplace.json index 7952bbb4d..ce5e1f5c4 100644 --- a/.github/plugin/marketplace.json +++ b/.github/plugin/marketplace.json @@ -16,6 +16,11 @@ "name": "agentv-claude-trace", "description": "Session tracing plugin — exports Claude Code session traces via OpenTelemetry", "source": "./plugins/agentv-claude-trace" + }, + { + "name": "agentic-architect", + "description": "Design and review AI agent architectures — species selection, workflow patterns, and plugin quality review", + "source": "./plugins/agentic-architect" } ] } diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml new file mode 100644 index 000000000..815846321 --- /dev/null +++ b/.github/workflows/validate.yml @@ -0,0 +1,23 @@ +name: Validate + +on: + push: + branches: [main] + pull_request: + +jobs: + links: + name: Check Links + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Check relative markdown links + uses: lycheeverse/lychee-action@v2 + with: + args: >- + --offline + --no-progress + --glob-ignore-case + --root-dir . + "**/*.md" diff --git a/.opencode/plugins/entire.ts b/.opencode/plugins/entire.ts deleted file mode 100644 index 2e4d30a15..000000000 --- a/.opencode/plugins/entire.ts +++ /dev/null @@ -1,158 +0,0 @@ -// Entire CLI plugin for OpenCode -// Auto-generated by `entire enable --agent opencode` -// Do not edit manually — changes will be overwritten on next install. 
-// Requires Bun runtime (used by OpenCode's plugin system for loading ESM plugins). -import type { Plugin } from "@opencode-ai/plugin" - -export const EntirePlugin: Plugin = async ({ $, directory }) => { - const ENTIRE_CMD = "entire" - // Track seen user messages to fire turn-start only once per message - const seenUserMessages = new Set() - // Track current session ID for message events (which don't include sessionID) - let currentSessionID: string | null = null - // Track the model used by the most recent assistant message - let currentModel: string | null = null - // In-memory store for message metadata (role, tokens, etc.) - const messageStore = new Map() - - /** - * Pipe JSON payload to an entire hooks command (async). - * Errors are logged but never thrown — plugin failures must not crash OpenCode. - */ - async function callHook(hookName: string, payload: Record) { - try { - const json = JSON.stringify(payload) - await $`echo ${json} | ${ENTIRE_CMD} hooks opencode ${hookName}`.cwd(directory).quiet().nothrow() - } catch { - // Silently ignore — plugin failures must not crash OpenCode - } - } - - /** - * Synchronous variant for hooks that fire near process exit (turn-end, session-end). - * `opencode run` breaks its event loop on the same session.status idle event that - * triggers turn-end. The async callHook would be killed before completing. - * Bun.spawnSync blocks the event loop, preventing exit until the hook finishes. 
- */ - function callHookSync(hookName: string, payload: Record) { - try { - const json = JSON.stringify(payload) - Bun.spawnSync(["sh", "-c", `${ENTIRE_CMD} hooks opencode ${hookName}`], { - cwd: directory, - stdin: new TextEncoder().encode(json + "\n"), - stdout: "ignore", - stderr: "ignore", - }) - } catch { - // Silently ignore — plugin failures must not crash OpenCode - } - } - - return { - event: async ({ event }) => { - switch (event.type) { - case "session.created": { - const session = (event as any).properties?.info - if (!session?.id) break - // Reset per-session tracking state when switching sessions. - if (currentSessionID !== session.id) { - seenUserMessages.clear() - messageStore.clear() - currentModel = null - } - currentSessionID = session.id - await callHook("session-start", { - session_id: session.id, - }) - break - } - - case "message.updated": { - const msg = (event as any).properties?.info - if (!msg) break - // Store message metadata (role, time, tokens, etc.) - messageStore.set(msg.id, msg) - // Track model from assistant messages - if (msg.role === "assistant" && msg.modelID) { - currentModel = msg.modelID - } - break - } - - case "message.part.updated": { - const part = (event as any).properties?.part - if (!part?.messageID) break - - // Fire turn-start on the first text part of a new user message - const msg = messageStore.get(part.messageID) - if (msg?.role === "user" && part.type === "text" && !seenUserMessages.has(msg.id)) { - seenUserMessages.add(msg.id) - const sessionID = msg.sessionID ?? currentSessionID - if (sessionID) { - await callHook("turn-start", { - session_id: sessionID, - prompt: part.text ?? "", - model: currentModel ?? "", - }) - } - } - break - } - - case "session.status": { - // session.status fires in both TUI and non-interactive (run) mode. - // session.idle is deprecated and not reliably emitted in run mode. 
- const props = (event as any).properties - if (props?.status?.type !== "idle") break - const sessionID = props?.sessionID ?? currentSessionID - if (!sessionID) break - // Use sync variant: `opencode run` exits on the same idle event, - // so an async hook would be killed before completing. - callHookSync("turn-end", { - session_id: sessionID, - model: currentModel ?? "", - }) - break - } - - case "session.compacted": { - const sessionID = (event as any).properties?.sessionID - if (!sessionID) break - await callHook("compaction", { - session_id: sessionID, - }) - break - } - - case "session.deleted": { - const session = (event as any).properties?.info - if (!session?.id) break - seenUserMessages.clear() - messageStore.clear() - currentSessionID = null - // Use sync variant: session-end may fire during shutdown. - callHookSync("session-end", { - session_id: session.id, - }) - break - } - - case "server.instance.disposed": { - // Fires when OpenCode shuts down (TUI close or `opencode run` exit). - // session.deleted only fires on explicit user deletion, not on quit, - // so this is the only reliable way to end sessions on exit. - if (!currentSessionID) break - const sessionID = currentSessionID - seenUserMessages.clear() - messageStore.clear() - currentSessionID = null - // Use sync variant: this is the last event before process exit. - callHookSync("session-end", { - session_id: sessionID, - }) - break - } - } - }, - } -} diff --git a/apps/cli/README.md b/apps/cli/README.md deleted file mode 100644 index bb10c1270..000000000 --- a/apps/cli/README.md +++ /dev/null @@ -1,603 +0,0 @@ -# AgentV - -**CLI-first AI agent evaluation. No server. No signup. No overhead.** - -AgentV evaluates your agents locally with multi-objective scoring (correctness, latency, cost, safety) from YAML specifications. Deterministic code graders + customizable LLM graders, all version-controlled in Git. - -## Installation - -### All Agents Plugin Manager - -**1. 
Add AgentV marketplace source:** -```bash -npx allagents plugin marketplace add EntityProcess/agentv -``` - -**2. Ask Claude to set up AgentV in your current repository** -Example prompt: -```text -Set up AgentV in this repo. -``` - -The `agentv-onboarding` skill bootstraps setup automatically: -- verifies `agentv` CLI availability -- installs the CLI if needed -- runs `agentv init` -- verifies setup artifacts - -### CLI-Only Setup (Fallback) - -If you are not using Claude plugins, use the CLI directly. - -**1. Install:** -```bash -bun install -g agentv -``` - -Or with npm: -```bash -npm install -g agentv -``` - -**2. Initialize your workspace:** -```bash -agentv init -``` - -**3. Configure environment variables:** -- The init command creates a `.env.example` file in your project root -- Copy `.env.example` to `.env` and fill in your API keys, endpoints, and other configuration values -- Update the environment variable names in `.agentv/targets.yaml` to match those defined in your `.env` file - -**4. Create an eval** (`./evals/example.yaml`): -```yaml -description: Math problem solving evaluation -execution: - target: default - -tests: - - id: addition - criteria: Correctly calculates 15 + 27 = 42 - - input: What is 15 + 27? - - expected_output: "42" - - assertions: - - name: math_check - type: code-grader - command: ./validators/check_math.py -``` - -**5. Run the eval:** -```bash -agentv eval ./evals/example.yaml -``` - -Results appear in `.agentv/results/eval_.jsonl` with scores, reasoning, and execution traces. - -Learn more in the [examples/](examples/README.md) directory. For a detailed comparison with other frameworks, see [docs/COMPARISON.md](docs/COMPARISON.md). - -## Why AgentV? 
- -| Feature | AgentV | [LangWatch](https://github.com/langwatch/langwatch) | [LangSmith](https://github.com/langchain-ai/langsmith-sdk) | [LangFuse](https://github.com/langfuse/langfuse) | -|---------|--------|-----------|-----------|----------| -| **Setup** | `bun install -g agentv` | Cloud account + API key | Cloud account + API key | Cloud account + API key | -| **Server** | None (local) | Managed cloud | Managed cloud | Managed cloud | -| **Privacy** | All local | Cloud-hosted | Cloud-hosted | Cloud-hosted | -| **CLI-first** | ✓ | ✗ | Limited | Limited | -| **CI/CD ready** | ✓ | Requires API calls | Requires API calls | Requires API calls | -| **Version control** | ✓ (YAML in Git) | ✗ | ✗ | ✗ | -| **Evaluators** | Code + LLM + Custom | LLM only | LLM + Code | LLM only | - -**Best for:** Developers who want evaluation in their workflow, not a separate dashboard. Teams prioritizing privacy and reproducibility. - -## Features - -- **Multi-objective scoring**: Correctness, latency, cost, safety in one run -- **Multiple evaluator types**: Code validators, LLM graders, custom Python/TypeScript -- **Built-in targets**: VS Code Copilot, Codex CLI, Pi Coding Agent, Azure OpenAI, local CLI agents -- **Structured evaluation**: Rubric-based grading with weights and requirements -- **Batch evaluation**: Run hundreds of test cases in parallel -- **Export**: JSON, JSONL, YAML formats -- **Compare results**: Compute deltas between evaluation runs for A/B testing - -## Development - -Contributing to AgentV? Clone and set up the repository: - -```bash -git clone https://github.com/EntityProcess/agentv.git -cd agentv - -# Install Bun if you don't have it -curl -fsSL https://bun.sh/install | bash - -# Install dependencies and build -bun install && bun run build - -# Run tests -bun test -``` - -See [AGENTS.md](AGENTS.md) for development guidelines and design principles. 
- -### Releasing - -Version bump: - -```bash -bun run release # patch bump -bun run release minor -bun run release major -``` - -Canary rollout (recommended): - -```bash -bun run publish:next # publish current version to npm `next` -bun run promote:latest # promote same version to npm `latest` -bun run tag:next 2.18.0 # point npm `next` to an explicit version -bun run promote:latest 2.18.0 # point npm `latest` to an explicit version -``` - -Legacy prerelease flow (still available): - -```bash -bun run release:next # bump/increment `-next.N` -bun run release:next major # start new major prerelease line -``` - -## Core Concepts - -**Evaluation files** (`.yaml` or `.jsonl`) define test cases with expected outcomes. **Targets** specify which agent/provider to evaluate. **Graders** (code or LLM) score results. **Results** are written as JSONL/YAML for analysis and comparison. - -### JSONL Format Support - -For large-scale evaluations, AgentV supports JSONL (JSON Lines) format as an alternative to YAML: - -```jsonl -{"id": "test-1", "criteria": "Calculates correctly", "input": "What is 2+2?"} -{"id": "test-2", "criteria": "Provides explanation", "input": "Explain variables"} -``` - -Optional sidecar YAML metadata file (`dataset.eval.yaml` alongside `dataset.jsonl`): -```yaml -description: Math evaluation dataset -name: math-tests -execution: - target: azure-llm -assertions: - - name: correctness - type: llm-grader - prompt: ./graders/correctness.md -``` - -Benefits: Streaming-friendly, Git-friendly diffs, programmatic generation, industry standard (DeepEval, LangWatch, Hugging Face). 
- -## Usage - -### Running Evaluations - -```bash -# Validate evals -agentv validate evals/my-eval.yaml - -# Run an eval with default target (from eval file or targets.yaml) -agentv eval evals/my-eval.yaml - -# Override target -agentv eval --target azure-llm evals/**/*.yaml - -# Run specific test -agentv eval --test-id case-123 evals/my-eval.yaml - -# Dry-run with mock provider -agentv eval --dry-run evals/my-eval.yaml -``` - -See `agentv eval --help` for all options: workers, timeouts, output formats, trace dumping, and more. - -#### Output Formats - -Write results to different formats using the `-o` flag (format auto-detected from extension): - -```bash -# Default run workspace (index.jsonl + benchmark/timing/per-test artifacts) -agentv eval evals/my-eval.yaml - -# Self-contained HTML dashboard (opens in any browser, no server needed) -agentv eval evals/my-eval.yaml -o report.html - -# Explicit JSONL output -agentv eval evals/my-eval.yaml -o output.jsonl - -# Multiple formats simultaneously -agentv eval evals/my-eval.yaml -o report.html - -# JUnit XML for CI/CD integration -agentv eval evals/my-eval.yaml -o results.xml -``` - -The HTML report auto-refreshes every 2 seconds during a live run, then locks once the run completes. - -By default, `agentv eval` creates a run workspace under `.agentv/results/runs//` -with `index.jsonl` as the machine-facing manifest. - -You can also convert an existing manifest to HTML after the fact: - -```bash -agentv convert .agentv/results/runs/eval_/index.jsonl -o report.html -``` - -#### Timeouts - -AgentV does not apply a default top-level evaluation timeout. If you want one, set it explicitly -with `--agent-timeout`, or set `execution.agentTimeoutMs` in your AgentV config to make it the -default for your local runs. - -This top-level timeout is separate from provider- or tool-level timeouts. For example, an upstream -agent or tool call may still time out even when AgentV's own top-level timeout is unset. 
- -### Create Custom Evaluators - -Write code graders in Python or TypeScript: - -```python -# validators/check_answer.py -import json, sys -data = json.load(sys.stdin) -answer = data.get("answer", "") - -assertions = [] - -if "42" in answer: - assertions.append({"text": "Answer contains correct value (42)", "passed": True}) -else: - assertions.append({"text": "Answer does not contain expected value (42)", "passed": False}) - -passed = sum(1 for a in assertions if a["passed"]) -score = 1.0 if passed == len(assertions) else 0.0 - -print(json.dumps({ - "score": score, - "assertions": assertions, -})) -``` - -Reference evaluators in your eval file: - -```yaml -assertions: - - name: my_validator - type: code-grader - command: ./validators/check_answer.py -``` - -For complete templates, examples, and evaluator patterns, see: [custom-evaluators](https://agentv.dev/evaluators/custom-evaluators/) - -### TypeScript SDK - -#### Custom Assertions with `defineAssertion()` - -Create custom assertion types in TypeScript using `@agentv/eval`: - -```typescript -// .agentv/assertions/word-count.ts -import { defineAssertion } from '@agentv/eval'; - -export default defineAssertion(({ answer }) => { - const wordCount = answer.trim().split(/\s+/).length; - return { - pass: wordCount >= 3, - reasoning: `Output has ${wordCount} words`, - }; -}); -``` - -Files in `.agentv/assertions/` are auto-discovered by filename — use directly in YAML: - -```yaml -assertions: - - type: word-count # matches word-count.ts - - type: contains - value: "Hello" -``` - -See the [sdk-custom-assertion example](examples/features/sdk-custom-assertion). 
- -#### Programmatic API with `evaluate()` - -Use AgentV as a library — no YAML needed: - -```typescript -import { evaluate } from '@agentv/core'; - -const { results, summary } = await evaluate({ - tests: [ - { - id: 'greeting', - input: 'Say hello', - assertions: [{ type: 'contains', value: 'Hello' }], - }, - ], -}); - -console.log(`${summary.passed}/${summary.total} passed`); -``` - -Auto-discovers `default` target from `.agentv/targets.yaml` and `.env` credentials. See the [sdk-programmatic-api example](examples/features/sdk-programmatic-api). - -#### Typed Configuration with `defineConfig()` - -Create `agentv.config.ts` at your project root for typed, validated configuration: - -```typescript -import { defineConfig } from '@agentv/core'; - -export default defineConfig({ - execution: { workers: 5, maxRetries: 2 }, - output: { format: 'jsonl', dir: './results' }, - limits: { maxCostUsd: 10.0 }, -}); -``` - -See the [sdk-config-file example](examples/features/sdk-config-file). - -#### Scaffold Commands - -Bootstrap new assertions and eval files: - -```bash -agentv create assertion sentiment # → .agentv/assertions/sentiment.ts -agentv create eval my-eval # → evals/my-eval.eval.yaml + .cases.jsonl -``` - -### Compare Evaluation Results - -Compare a combined results file across all targets (N-way matrix): - -```bash -agentv compare .agentv/results/runs/eval_/index.jsonl -``` - -``` -Score Matrix - - Test ID gemini-3-flash-preview gpt-4.1 gpt-5-mini - ─────────────── ────────────────────── ─────── ────────── - code-generation 0.70 0.80 0.75 - greeting 0.90 0.85 0.95 - summarization 0.85 0.90 0.80 - -Pairwise Summary: - gemini-3-flash-preview → gpt-4.1: 1 win, 0 losses, 2 ties (Δ +0.033) - gemini-3-flash-preview → gpt-5-mini: 0 wins, 0 losses, 3 ties (Δ +0.017) - gpt-4.1 → gpt-5-mini: 0 wins, 0 losses, 3 ties (Δ -0.017) -``` - -Designate a baseline for CI regression gating, or compare two specific targets: - -```bash -agentv compare 
.agentv/results/runs/eval_/index.jsonl --baseline gpt-4.1 -agentv compare .agentv/results/runs/eval_/index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini -agentv compare before.jsonl after.jsonl # two-file pairwise -``` - -## Targets Configuration - -Define execution targets in `.agentv/targets.yaml` to decouple evals from providers: - -```yaml -targets: - - name: azure-llm - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - - - name: vscode_dev - provider: vscode - grader_target: azure-llm - - - name: local_agent - provider: cli - command: 'python agent.py --prompt-file {PROMPT_FILE} --output {OUTPUT_FILE}' - grader_target: azure-llm -``` - -Supports: `azure`, `anthropic`, `gemini`, `codex`, `copilot`, `pi-coding-agent`, `claude`, `vscode`, `vscode-insiders`, `cli`, and `mock`. - -Workspace templates are configured at eval-level under `workspace.template` (not per-target `workspace_template`). - -Use `${{ VARIABLE_NAME }}` syntax to reference your `.env` file. See `.agentv/targets.yaml` after `agentv init` for detailed examples and all provider-specific fields. 
- -## Evaluation Features - -### Code Graders - -Write validators in any language (Python, TypeScript, Node, etc.): - -```bash -# Input: stdin JSON with question, criteria, answer -# Output: stdout JSON with score (0-1), hits, misses, reasoning -``` - -For complete examples and patterns, see: -- [custom-evaluators](https://agentv.dev/evaluators/custom-evaluators/) -- [code-grader-sdk example](examples/features/code-grader-sdk) - -### Deterministic Assertions - -Built-in assertion types for common text-matching patterns — no LLM grader or code_grader needed: - -| Type | Value | Behavior | -|------|-------|----------| -| `contains` | `string` | Pass if output includes the substring | -| `contains_any` | `string[]` | Pass if output includes ANY of the strings | -| `contains_all` | `string[]` | Pass if output includes ALL of the strings | -| `icontains` | `string` | Case-insensitive `contains` | -| `icontains_any` | `string[]` | Case-insensitive `contains_any` | -| `icontains_all` | `string[]` | Case-insensitive `contains_all` | -| `starts_with` | `string` | Pass if output starts with value (trimmed) | -| `ends_with` | `string` | Pass if output ends with value (trimmed) | -| `regex` | `string` | Pass if output matches regex (optional `flags: "i"`) | -| `equals` | `string` | Pass if output exactly equals value (trimmed) | -| `is_json` | — | Pass if output is valid JSON | - -All assertions support `weight`, `required`, and `negate` flags. Use `negate: true` to invert (no `not_` prefix needed). - -```yaml -assertions: - # Case-insensitive matching for natural language variation - - type: icontains-any - value: ["missing rule code", "need rule code", "provide rule code"] - required: true - - # Multiple required terms - - type: icontains-all - value: ["country code", "rule codes"] - - # Case-insensitive regex - - type: regex - value: "[a-z]+@[a-z]+\\.[a-z]+" - flags: "i" -``` - -See the [assert-extended example](examples/features/assert-extended) for complete patterns. 
- -### Target Configuration: `grader_target` - -Agent provider targets (`codex`, `copilot`, `claude`, `vscode`) **must** specify `grader_target` (also accepts `judge_target` for backward compatibility) when using `llm_grader` or `rubrics` evaluators. Without it, AgentV errors at startup — agent providers cannot return structured JSON for grading. - -```yaml -targets: - # Agent target — requires grader_target for LLM-based evaluation - - name: codex_local - provider: codex - grader_target: azure-llm # Required: LLM provider for grading - - # LLM target — no grader_target needed (grades itself) - - name: azure-llm - provider: azure -``` - -### Agentic Eval Patterns - -When agents respond via tool calls instead of text, use `tool_trajectory` instead of text assertions: - -- **Agent takes workspace actions** (creates files, runs commands) → `tool_trajectory` evaluator -- **Agent responds in text** (answers questions, asks for info) → `contains`/`icontains_any`/`llm_grader` -- **Agent does both** → `composite` evaluator combining both - -### LLM Graders - -Create markdown grader files with evaluation criteria and scoring guidelines: - -```yaml -assertions: - - name: semantic_check - type: llm-grader - prompt: ./graders/correctness.md -``` - -Your grader prompt file defines criteria and scoring guidelines. - -### Rubric-Based Evaluation - -Define structured criteria directly in your test: - -```yaml -tests: - - id: quicksort-explain - criteria: Explain how quicksort works - - input: Explain quicksort algorithm - - assertions: - - type: rubrics - criteria: - - Mentions divide-and-conquer approach - - Explains partition step - - States time complexity -``` - -Scoring: `(satisfied weights) / (total weights)` → verdicts: `pass` (≥0.8), `borderline` (≥0.6), `fail` - -Author assertions directly in your eval file. When you want help choosing between simple assertions, deterministic graders, and LLM-based graders, use the `agentv-eval-writer` skill. 
- -See [rubric evaluator](https://agentv.dev/evaluation/rubrics/) for detailed patterns. - -## Advanced Configuration - -### Retry Behavior - -Configure automatic retry with exponential backoff: - -```yaml -targets: - - name: azure-llm - provider: azure - max_retries: 5 - retry_initial_delay_ms: 2000 - retry_max_delay_ms: 120000 - retry_backoff_factor: 2 - retry_status_codes: [500, 408, 429, 502, 503, 504] -``` - -Automatically retries on rate limits, transient 5xx errors, and network failures with jitter. - -## Documentation & Learning - -**Getting Started:** -- Run `agentv init` to set up your first evaluation workspace -- Check [examples/README.md](examples/README.md) for demos (math, code generation, tool use) -- AI agents: Ask Claude Code to `/agentv-eval-builder` to create and iterate on evals - -**Detailed Guides:** -- [Evaluation format and structure](https://agentv.dev/evaluation/eval-files/) -- [Custom evaluators](https://agentv.dev/evaluators/custom-evaluators/) -- [Rubric evaluator](https://agentv.dev/evaluation/rubrics/) -- [Composite evaluator](https://agentv.dev/evaluators/composite/) -- [Tool trajectory evaluator](https://agentv.dev/evaluators/tool-trajectory/) -- [Structured data evaluators](https://agentv.dev/evaluators/structured-data/) -- [Batch CLI evaluation](https://agentv.dev/evaluation/batch-cli/) -- [Compare results](https://agentv.dev/tools/compare/) -- [Example evaluations](https://agentv.dev/evaluation/examples/) - -**Reference:** -- Monorepo structure: `packages/core/` (engine), `packages/eval/` (evaluation logic), `apps/cli/` (commands) - -## Troubleshooting - -### `EACCES` permission error on global install (npm) - -If you see `EACCES: permission denied` when running `npm install -g agentv`, switch to bun (recommended) or configure npm to use a user-owned directory: - -**Option 1 (recommended): Use bun instead** -```bash -bun install -g agentv -``` - -**Option 2: Fix npm permissions** -```bash -mkdir -p ~/.npm-global -npm config set 
prefix ~/.npm-global --location=user -``` - -Then add the directory to your PATH. For bash (`~/.bashrc`) or zsh (`~/.zshrc`): - -```bash -echo 'export PATH=~/.npm-global/bin:$PATH' >> ~/.bashrc -source ~/.bashrc -``` - -After this, `npm install -g` will work without `sudo`. - -## Contributing - -See [AGENTS.md](AGENTS.md) for development guidelines, design principles, and quality assurance workflow. - -## License - -MIT License - see [LICENSE](LICENSE) for details. diff --git a/apps/examples b/apps/examples deleted file mode 120000 index a6573af9c..000000000 --- a/apps/examples +++ /dev/null @@ -1 +0,0 @@ -../examples \ No newline at end of file diff --git a/evals/agentic-architect/agent-plugin-review.eval.yaml b/evals/agentic-architect/agent-plugin-review.eval.yaml new file mode 100644 index 000000000..35df69969 --- /dev/null +++ b/evals/agentic-architect/agent-plugin-review.eval.yaml @@ -0,0 +1,131 @@ +description: Evaluates that the agent-plugin-review skill is triggered and catches planted issues in a mock plugin + +execution: + targets: + - pi-cli + +workspace: + template: ./workspace-template + +tests: + - id: detect-missing-eval + criteria: Identifies that deploy-rollback skill has no corresponding eval file + input: | + Review the deploy-auto plugin in this repo for completeness. + Check that every skill has a corresponding eval file. + assertions: + - type: skill-trigger + value: agent-plugin-review + - type: contains + value: deploy-rollback + - type: rubrics + criteria: + - Flags that deploy-rollback skill has no corresponding eval file + - Does not flag deploy-plan or deploy-execute as missing evals + + - id: detect-eval-naming + criteria: Identifies eval files using bare .yaml instead of .eval.yaml + input: | + Review the eval files under evals/deploy-auto/ for naming convention issues. 
+ assertions: + - type: skill-trigger + value: agent-plugin-review + - type: contains + value: .eval.yaml + - type: rubrics + criteria: + - Flags deploy-plan.yaml as using wrong extension + - Recommends renaming to .eval.yaml + - Does not flag deploy-execute.eval.yaml + + - id: detect-missing-assertions + criteria: Identifies eval tests without assertions that rely solely on expected_output prose + input: | + Review evals/deploy-auto/deploy-plan.yaml for eval quality issues. + Check assertion coverage and expected_output format. + assertions: + - type: skill-trigger + value: agent-plugin-review + - type: rubrics + criteria: + - Flags that no assertions are defined in deploy-plan.yaml + - Notes that expected_output contains evaluation criteria prose rather than sample responses + - Suggests adding deterministic assertions + + - id: detect-relative-file-paths + criteria: Identifies eval file paths missing leading slash + input: | + Review evals/deploy-auto/deploy-plan.yaml for file path formatting issues. + assertions: + - type: skill-trigger + value: agent-plugin-review + - type: rubrics + criteria: + - Flags that file paths are missing a leading slash + - Shows the corrected path format with leading slash + + - id: detect-repeated-inputs + criteria: Identifies eval files repeating the same file input in every test + input: | + Review evals/deploy-auto/deploy-plan.yaml for structural improvements. + Look at how inputs are organized across test cases. + assertions: + - type: skill-trigger + value: agent-plugin-review + - type: rubrics + criteria: + - Identifies the repeated SKILL.md file input across all 3 tests + - Recommends using top-level input for the shared file reference + + - id: detect-missing-hard-gates + criteria: Identifies that deploy-execute has no hard gate checking for deploy-plan.md + input: | + Review the deploy-auto plugin's workflow architecture. + Check whether phases enforce prerequisites before proceeding. 
+ assertions: + - type: skill-trigger + value: agent-plugin-review + - type: rubrics + criteria: + - Flags that deploy-execute does not check for deploy-plan.md before starting + - Recommends adding hard gates between phases + - Suggests stopping with a clear message if prerequisites are missing + + - id: detect-factual-contradiction + criteria: Identifies that deploy-execute says pytest but its eval says python -m unittest + input: | + Review evals/deploy-auto/deploy-execute.eval.yaml for factual accuracy. + Cross-check expected outputs against what the skills actually document. + assertions: + - type: skill-trigger + value: agent-plugin-review + - type: rubrics + criteria: + - Flags the contradiction between pytest (skill) and python -m unittest (eval) + - Recommends updating the eval to match the skill + + - id: detect-nonexistent-command-reference + criteria: Identifies that deploy-plan references /deploy-execute which is not a command + input: | + Review plugins/deploy-auto/skills/deploy-plan/SKILL.md for cross-reference issues. + Check that referenced commands and skills actually exist. + assertions: + - type: skill-trigger + value: agent-plugin-review + - type: rubrics + criteria: + - Flags that /deploy-execute is referenced but does not exist as a slash command + - Notes the distinction between skills and slash commands + - Suggests either creating the command or updating the handoff + + - id: detect-hardcoded-paths + criteria: Identifies hardcoded local paths in deploy-execute skill + input: | + Review plugins/deploy-auto/skills/deploy-execute/SKILL.md for portability issues. 
+ assertions: + - type: skill-trigger + value: agent-plugin-review + - type: rubrics + criteria: + - Flags the hardcoded path C:\Users\admin\.kube\config + - Recommends using environment variables or configurable defaults diff --git a/evals/agentic-architect/workspace-template/evals/deploy-auto/deploy-execute.eval.yaml b/evals/agentic-architect/workspace-template/evals/deploy-auto/deploy-execute.eval.yaml new file mode 100644 index 000000000..2e00f579e --- /dev/null +++ b/evals/agentic-architect/workspace-template/evals/deploy-auto/deploy-execute.eval.yaml @@ -0,0 +1,48 @@ +description: Tests the deploy-execute skill + +tests: + - id: execute-plan + criteria: Executes deployment steps from deploy-plan.md + input: + - role: user + content: + - type: file + value: "/plugins/deploy-auto/skills/deploy-execute/SKILL.md" + - type: text + value: "Execute the deployment plan at ./output/deploy-plan.md" + assertions: + - type: rubrics + criteria: + - Reads the deployment plan file + - Executes steps in order + - Runs health checks after each step + + - id: health-check-failure + criteria: Stops and rolls back on health check failure + input: + - role: user + content: + - type: file + value: "/plugins/deploy-auto/skills/deploy-execute/SKILL.md" + - type: text + value: "The API service health check is failing after deployment. What should happen?" + assertions: + - type: contains + value: rollback + - type: rubrics + criteria: + - Recommends executing the rollback command + - Stops the deployment pipeline + + - id: run-tests + criteria: Runs integration tests after deployment + input: + - role: user + content: + - type: file + value: "/plugins/deploy-auto/skills/deploy-execute/SKILL.md" + - type: text + value: "Deployment is complete. Run the integration tests." + expected_output: + - role: assistant + content: "The agent should run the test suite using python -m unittest discover to verify the deployment." 
diff --git a/evals/agentic-architect/workspace-template/evals/deploy-auto/deploy-plan.yaml b/evals/agentic-architect/workspace-template/evals/deploy-auto/deploy-plan.yaml new file mode 100644 index 000000000..986dab568 --- /dev/null +++ b/evals/agentic-architect/workspace-template/evals/deploy-auto/deploy-plan.yaml @@ -0,0 +1,41 @@ +description: Tests the deploy-plan skill + +tests: + - id: basic-plan + criteria: Creates a deployment plan from a release spec + input: + - role: user + content: + - type: file + value: "plugins/deploy-auto/skills/deploy-plan/SKILL.md" + - type: text + value: "Create a deployment plan for releasing v2.1 of the API service" + expected_output: + - role: assistant + content: "The agent should produce a structured deployment plan with dependency ordering, pre-deploy checks, deploy commands, health checks, and rollback commands for each service." + + - id: multi-service-ordering + criteria: Orders deployments by dependency graph + input: + - role: user + content: + - type: file + value: "plugins/deploy-auto/skills/deploy-plan/SKILL.md" + - type: text + value: "Plan deployment for 3 services: frontend (depends on API), API (depends on database), database (no deps)" + expected_output: + - role: assistant + content: "The agent should order: database first, then API, then frontend." + + - id: rollback-checkpoints + criteria: Includes rollback checkpoints for each step + input: + - role: user + content: + - type: file + value: "plugins/deploy-auto/skills/deploy-plan/SKILL.md" + - type: text + value: "The release has 4 services. Make sure I can rollback at any point." + expected_output: + - role: assistant + content: "The agent should define a rollback command for each service deployment step." 
diff --git a/evals/agentic-architect/workspace-template/plugins/deploy-auto/AGENTS.md b/evals/agentic-architect/workspace-template/plugins/deploy-auto/AGENTS.md new file mode 100644 index 000000000..c12f1b7dc --- /dev/null +++ b/evals/agentic-architect/workspace-template/plugins/deploy-auto/AGENTS.md @@ -0,0 +1,11 @@ +# Deploy Auto Plugin + +## Rule: Deployment Workflow + +TRIGGER: Working on a deployment or release +ACTION: Follow the deploy pipeline. Use `/deploy-pipeline`. + +## Rule: Multi-Service Coordination + +TRIGGER: Deployment spans multiple services +ACTION: Deploy in dependency order — databases first, backends second, frontends last. diff --git a/evals/agentic-architect/workspace-template/plugins/deploy-auto/commands/deploy-pipeline.md b/evals/agentic-architect/workspace-template/plugins/deploy-auto/commands/deploy-pipeline.md new file mode 100644 index 000000000..0be514a40 --- /dev/null +++ b/evals/agentic-architect/workspace-template/plugins/deploy-auto/commands/deploy-pipeline.md @@ -0,0 +1,24 @@ +--- +description: "Run the full deployment pipeline: plan, execute, and verify" +argument-hint: "" +--- + +# Deploy Pipeline + +## Input + +$ARGUMENTS = path to release specification + +## Execution + +### Phase 1 — Plan + +Load and follow: `skills/deploy-plan/SKILL.md` + +### Phase 2 — Execute + +Load and follow: `skills/deploy-execute/SKILL.md` + +### Phase 3 — Verify + +Run integration tests and produce a deployment report. diff --git a/evals/agentic-architect/workspace-template/plugins/deploy-auto/skills/deploy-execute/SKILL.md b/evals/agentic-architect/workspace-template/plugins/deploy-auto/skills/deploy-execute/SKILL.md new file mode 100644 index 000000000..e1a8f45c7 --- /dev/null +++ b/evals/agentic-architect/workspace-template/plugins/deploy-auto/skills/deploy-execute/SKILL.md @@ -0,0 +1,38 @@ +--- +name: deploy-execute +description: >- + This skill should be used when asked to "execute a deployment", "run the deploy plan", + or "deploy services". 
Reads deploy-plan.md and executes each step with health checks. +--- + +# Deploy Execute Skill + +## Purpose + +Execute a deployment plan step-by-step. Reads `{output_dir}/deploy-plan.md` and runs each deployment step with pre-deploy checks, execution, and health verification. + +## Process + +Read the deployment plan and execute each step in order. + +For each service: +1. Run pre-deploy checks +2. Execute the deploy command using `kubectl apply` +3. Run health checks +4. If health check fails, execute rollback command and stop + +## Test Execution + +Execute integration tests after deployment using pytest with the `--tb=short` flag for concise tracebacks. + +## Configuration + +| Setting | Default | Override | +|---------|---------|----------| +| Kubernetes context | `C:\Users\admin\.kube\config` | User specifies alternative path | +| Deploy timeout | 300s | `--timeout` flag | +| Health check retries | 3 | `--retries` flag | + +## Skill Resources + +- `references/health-check-patterns.md` — Health check implementation patterns diff --git a/evals/agentic-architect/workspace-template/plugins/deploy-auto/skills/deploy-execute/references/health-check-patterns.md b/evals/agentic-architect/workspace-template/plugins/deploy-auto/skills/deploy-execute/references/health-check-patterns.md new file mode 100644 index 000000000..b5c20b4db --- /dev/null +++ b/evals/agentic-architect/workspace-template/plugins/deploy-auto/skills/deploy-execute/references/health-check-patterns.md @@ -0,0 +1,19 @@ +# Health Check Patterns + +## HTTP Health Check + +```bash +curl -sf http://service:8080/health || exit 1 +``` + +## TCP Health Check + +```bash +nc -z service 8080 || exit 1 +``` + +## Custom Script + +```bash +./scripts/check-service.sh --service api --timeout 30 +``` diff --git a/evals/agentic-architect/workspace-template/plugins/deploy-auto/skills/deploy-plan/SKILL.md b/evals/agentic-architect/workspace-template/plugins/deploy-auto/skills/deploy-plan/SKILL.md new file mode 100644 
index 000000000..a4eb3da56 --- /dev/null +++ b/evals/agentic-architect/workspace-template/plugins/deploy-auto/skills/deploy-plan/SKILL.md @@ -0,0 +1,38 @@ +--- +name: deploy-plan +description: >- + This skill should be used when asked to "plan a deployment", "create a deploy plan", + or "prepare release steps". Produces a deployment plan with rollback strategy. +--- + +# Deploy Plan Skill + +## Purpose + +Create a structured deployment plan from a release specification. Produces `{output_dir}/deploy-plan.md` with step-by-step instructions, dependency ordering, and rollback checkpoints. + +## When to Use + +- Planning a new deployment from a release spec +- Coordinating multi-service deployments with dependency ordering +- Generating rollback checkpoints for each deployment step + +## Process + +1. Read the release specification +2. Identify affected services and their dependencies +3. Order deployments by dependency graph (databases first, then backends, then frontends) +4. For each service, define: pre-deploy checks, deploy command, health check, rollback command +5. Write `{output_dir}/deploy-plan.md` + +## Phase Handoff + +After completing the plan, tell the user: + +"Next step — run: +/deploy-execute {output_dir} +Or let the orchestrator continue automatically." + +## Skill Resources + +- `references/deployment-patterns.md` — Common deployment patterns and anti-patterns diff --git a/evals/agentic-architect/workspace-template/plugins/deploy-auto/skills/deploy-plan/references/deployment-patterns.md b/evals/agentic-architect/workspace-template/plugins/deploy-auto/skills/deploy-plan/references/deployment-patterns.md new file mode 100644 index 000000000..d3d33457b --- /dev/null +++ b/evals/agentic-architect/workspace-template/plugins/deploy-auto/skills/deploy-plan/references/deployment-patterns.md @@ -0,0 +1,13 @@ +# Deployment Patterns + +## Blue-Green Deployment + +Run two identical environments. Route traffic to the new version after health checks pass. 
+ +## Canary Deployment + +Route a small percentage of traffic to the new version. Monitor error rates before full rollout. + +## Rolling Deployment + +Update instances one at a time. Each instance is health-checked before proceeding to the next. diff --git a/evals/agentic-architect/workspace-template/plugins/deploy-auto/skills/deploy-rollback/SKILL.md b/evals/agentic-architect/workspace-template/plugins/deploy-auto/skills/deploy-rollback/SKILL.md new file mode 100644 index 000000000..2ffa43411 --- /dev/null +++ b/evals/agentic-architect/workspace-template/plugins/deploy-auto/skills/deploy-rollback/SKILL.md @@ -0,0 +1,28 @@ +--- +name: deploy-rollback +description: >- + This skill should be used when asked to "rollback a deployment", "revert services", + or "undo deploy". Reads deploy-plan.md and reverses completed steps. +--- + +# Deploy Rollback Skill + +## Purpose + +Rollback a failed or unwanted deployment. Reads `{output_dir}/deploy-plan.md` and reverses each completed step in reverse dependency order. + +## Process + +1. Read deploy-plan.md to identify completed steps +2. For each completed step (in reverse order): + a. Execute the rollback command + b. Verify the service returns to its previous state + c. Run health checks +3. 
Write `{output_dir}/rollback-report.md` + +## Stop Conditions + +Stop and report immediately if: +- A rollback command fails +- Health checks fail after rollback +- The deploy plan cannot be read diff --git a/evals/architecture/dataset.eval.yaml b/evals/self/eval.yaml similarity index 99% rename from evals/architecture/dataset.eval.yaml rename to evals/self/eval.yaml index bdd8ac68e..af60bb868 100644 --- a/evals/architecture/dataset.eval.yaml +++ b/evals/self/eval.yaml @@ -1,8 +1,5 @@ description: Evaluates whether proposed changes follow AGENTS.md design principles -execution: - target: default - tests: - id: violates-lightweight-core criteria: | diff --git a/examples/showcase/offline-grader-benchmark/README.md b/examples/showcase/offline-grader-benchmark/README.md index b56e41d4c..141da6d5c 100644 --- a/examples/showcase/offline-grader-benchmark/README.md +++ b/examples/showcase/offline-grader-benchmark/README.md @@ -167,7 +167,7 @@ Most frameworks store ground-truth datasets in platform-internal formats (DataFr ### Why the scoring script stays outside core -Per AgentV's [design principles](../../CLAUDE.md) — "Lightweight Core, Plugin Extensibility" — CLI wrappers that consume JSONL output for post-processing belong outside core. The scoring script composes existing primitives and serves a niche use case, consistent with "Built-ins for Primitives Only." +Per AgentV's [design principles](../../../CLAUDE.md) — "Lightweight Core, Plugin Extensibility" — CLI wrappers that consume JSONL output for post-processing belong outside core. The scoring script composes existing primitives and serves a niche use case, consistent with "Built-ins for Primitives Only." 
## Why this stays lightweight diff --git a/packages/eval/README.md b/packages/eval/README.md index bb8fd2269..120c1276e 100644 --- a/packages/eval/README.md +++ b/packages/eval/README.md @@ -57,7 +57,7 @@ For complete documentation including: - Execution metrics usage - Best practices -See the [Custom Evaluators Guide](../../plugins/agentv-dev/skills/agentv-eval-builder/references/custom-evaluators.md) or run AgentV's `/agentv-eval-builder` skill. +See the [Custom Evaluators Guide](../../plugins/agentv-dev/skills/agentv-eval-writer/references/custom-evaluators.md) or run AgentV's `/agentv-eval-builder` skill. ## Repository diff --git a/plugins/agentic-architect/skills/agent-architecture-design/SKILL.md b/plugins/agentic-architect/skills/agent-architecture-design/SKILL.md new file mode 100644 index 000000000..283620b4e --- /dev/null +++ b/plugins/agentic-architect/skills/agent-architecture-design/SKILL.md @@ -0,0 +1,108 @@ +--- +name: agent-architecture-design +description: >- + Use when designing an AI agent system, selecting agentic design patterns, + planning multi-phase workflows, choosing between single-agent and multi-agent architectures, + or when asked "what kind of agent should I build", "how should I structure this automation", + "design an agent for X", or "which agentic pattern fits this problem". +--- + +# Agent Architecture Design + +## Overview + +Guide the selection and design of the correct agentic architecture by diagnosing the problem type, mapping it to a proven design pattern, and defining the workflow structure, tooling, and management model. 
+ +## Process + +### Phase 1: Problem Diagnosis + +Categorize the request on two axes: + +| | Task-Level (single job) | Project-Level (coordination needed) | +|---|---|---| +| **Software-Shaped** (working code/system) | Single-Agent Iterative Loop | Autonomous Pipeline or Multi-Agent System | +| **Metric-Shaped** (optimize a number) | Optimization Loop | Optimization Loop + Multi-Agent System | + +**Diagnosis questions:** +1. Is the goal working software or optimizing a metric? +2. Is this a single discrete task or multiple coordinated parts? +3. How much human involvement is acceptable during execution? +4. What scale justifies the architecture complexity? + +### Phase 2: Pattern Selection + +Load `references/agentic-design-patterns.md` for full details on each pattern. Summary: + +**Single-Agent Iterative Loop** (Agentic IDE) +- Human = manager, Agent = worker +- Decompose the problem into small chunks (UI, API, tests) +- Agent gets a workspace (terminal, files, search) +- Best for: individual developer productivity on discrete tasks + +**Autonomous Pipeline** (Zero-Human Loop) +- Spec In → Autonomous Zone → Eval Out +- Heavy human involvement at start (specs) and end (review), zero in the middle +- Requires robust evals — iterations happen automatically until eval passes +- Best for: zero-human-intervention software delivery + +**Optimization Loop** (Self-Improving Agent) +- Hill climbing against a specific metric +- Agent tries paths, fails, backtracks +- Needs a clear optimization target +- Best for: reaching peak of an optimization metric through experimentation + +**Multi-Agent System** (Hierarchical/Supervisor Pattern) +- Specialized roles with defined handoffs (Researcher → Writer → Editor → Publisher) +- Complexity lies in context management between agents +- Only justified at scale (10,000 tickets, not 10) +- Best for: seamless coordination across specialized AI workers + +### Phase 3: Workflow Architecture + +After selecting a pattern, define the 
workflow structure. Load `references/workflow-patterns.md` for framework-specific patterns. + +**For each pattern, define:** + +1. **Phases** — What sequential or parallel steps does the workflow execute? +2. **Artifacts** — What does each phase produce? (specs, designs, tasks, code, reports) +3. **Gates** — What must be true before proceeding to the next phase? +4. **Tooling** — What tools/MCPs does each agent need? +5. **Context flow** — How is information passed between phases/agents? +6. **Resumption** — How does the workflow recover from interruption? + +**Pattern → Workflow mapping:** + +| Agentic Design Pattern | Typical Workflow | +|---|---| +| Single-Agent Iterative Loop | Single-phase: decompose → implement → verify | +| Autonomous Pipeline | OpenSpec-style: validate → propose → design → implement → verify | +| Optimization Loop | Iteration loop: hypothesize → test → measure → backtrack/advance | +| Multi-Agent System | Role pipeline: role₁ → handoff → role₂ → handoff → roleₙ | + +### Phase 4: Output + +Produce a design document covering: + +1. **Diagnosis** — Software or metric shaped, task or project level +2. **Recommended Pattern** — Which agentic architecture and why +3. **Workflow Design** — Phases, artifacts, gates, context flow +4. **Scaffolding Plan** — Tools, MCPs, evals the agent needs +5. **Management Model** — Human role (Manager, Observer, or Spec-Writer) + +## Implementation Rules + +1. **Simple scales better** — Do not recommend 3-level management if 2-level works. Simple configurations are more performant. +2. **Context is everything** — Agents depend entirely on the context and scaffolding provided by the architect. Design the scaffolding, not just the agent. +3. **Human-centered → Agent-centered** — For large projects, move from "human managing every agent" to "planner agent managing sub-agents" where the human observes. +4. **Avoid pattern-confusion** — Never use an Optimization Loop to build a novel. 
Never use a Single-Agent Loop for a project requiring specialized multi-agent orchestration. +5. **Scale justifies complexity** — Multi-agent orchestration is only worth it at scale. For small problems, a single well-prompted agent outperforms a complex framework. + +## Skill Resources + +- `references/agentic-design-patterns.md` — Detailed pattern descriptions with examples and anti-patterns +- `references/workflow-patterns.md` — Workflow patterns from OpenSpec, Superpowers, and Compound Engineering + +## Related Skills + +- **agent-plugin-review** — Review an implemented plugin against architecture best practices diff --git a/plugins/agentic-architect/skills/agent-architecture-design/references/agentic-design-patterns.md b/plugins/agentic-architect/skills/agent-architecture-design/references/agentic-design-patterns.md new file mode 100644 index 000000000..0cebab509 --- /dev/null +++ b/plugins/agentic-architect/skills/agent-architecture-design/references/agentic-design-patterns.md @@ -0,0 +1,152 @@ +# Agentic Design Patterns + +Four foundational architectures for AI agent systems. Each pattern defines a management model, workflow structure, and set of anti-patterns. + +## Single-Agent Iterative Loop (Agentic IDE) + +**Use when:** Problem is software-shaped and scale is task-level. + +**Architecture:** +- Human is the manager; agent is the worker +- Focus on decomposition — break the big problem into small, well-defined chunks +- Each chunk is independently implementable and testable + +**Tooling requirements:** +- Terminal access (shell, build tools, test runners) +- File system access (read, write, search) +- Search tools (grep, glob, web search) +- Version control (git) + +**Workflow:** Single-phase — decompose → implement → verify + +**Management model:** Human as manager. Human defines what to build, agent builds it, human reviews. + +**Example:** A developer using Claude Code to implement a feature. 
They describe what they want, the agent writes the code, developer reviews and iterates. + +**Anti-patterns:** +- Using a single-agent loop for a project that needs 10+ coordinated agents +- No decomposition — giving the agent one massive task instead of focused chunks +- No verification step — trusting agent output without review + +--- + +## Autonomous Pipeline (Zero-Human Loop) + +**Use when:** Problem is software-shaped and high autonomy is required. + +**Architecture:** +- Spec In → Autonomous Zone → Eval Out +- Human involvement is heavy at start (specs) and end (review), zero in the middle +- Iterations (v0.1 → v1.0) happen automatically until eval passes + +**Requirements:** +- Robust evals are mandatory — the system cannot self-correct without them +- Specs must be precise enough to generate working systems +- Evals must be discriminating — pass for good output, fail for bad + +**Workflow:** OpenSpec-style pipeline: +1. Validate (check requirements against reality) +2. Propose (define WHAT and WHY) +3. Design (plan HOW) +4. Implement (TDD through task checklist) +5. Verify (build + test + spec traceability) + +**Management model:** Human as spec-writer. Human writes specs and reviews final output. Everything in between is autonomous. + +**Example:** A spec-driven development plugin where the developer provides a work item number, and the system autonomously validates requirements, designs the implementation, codes it with TDD, and produces a PR. + +**Anti-patterns:** +- No evals — the system has no way to know when it's done or if it's correct +- Specs too vague — "make it better" is not a spec +- Human intervening in the autonomous zone — defeats the purpose + +--- + +## Optimization Loop (Self-Improving Agent) + +**Use when:** Problem is metric-shaped (optimization). 
+ +**Architecture:** +- Hill climbing against a specific metric +- Agent tries paths, fails, and backtracks +- Each iteration measures progress against the target + +**Requirements:** +- Clear, measurable optimization target +- Fast feedback loop (metric must be computable quickly) +- Permission to explore and fail + +**Workflow:** Iteration loop: +1. Hypothesize (propose a change) +2. Test (apply the change) +3. Measure (evaluate against metric) +4. Decide (advance if improved, backtrack if not) +5. Repeat until target reached or budget exhausted + +**Management model:** Human as observer. Human defines the metric and constraints, agent explores the solution space. + +**Example:** Optimizing a prompt's accuracy against an eval suite. Agent tries variations, measures pass rate, keeps improvements, discards regressions. + +**Anti-patterns:** +- No clear metric — "make it better" is not optimizable +- Using for creative tasks — novels, designs, art have no single metric +- No backtracking — agent must be allowed to undo bad changes + +--- + +## Multi-Agent System (Hierarchical/Supervisor Pattern) + +**Use when:** Problem requires specialized roles and complex handoffs. + +**Architecture:** +- Define specialized roles (Researcher → Writer → Editor → Publisher) +- Focus on handoffs — complexity lies in context management between agents +- Each role has its own tools, context, and success criteria + +**Scale requirement:** Only justified when the volume warrants it. Managing 10,000 tickets needs orchestration. Managing 10 does not. + +**Workflow:** Role pipeline with handoffs: +1. Role₁ performs its task, produces output artifact +2. Handoff: artifact + summary passed to Role₂ +3. Role₂ performs its task, produces next artifact +4. Continue until pipeline complete + +**Management model:** Human as observer or planner-manager. For large scale, a planner agent manages sub-agents while human observes. 
+ +**Context management:** +- Each handoff loses context — design artifacts to carry essential information +- Summaries at each handoff prevent context window overflow +- Shared state (files, databases) can bridge context gaps + +**Example:** A content pipeline where a researcher gathers information, a writer produces a draft, an editor refines it, and a publisher formats and distributes it. + +**Anti-patterns:** +- Over-engineering — using orchestration for a 3-step task one person could do +- Poor handoffs — losing critical context between agents +- No specialization — all agents doing the same thing (just use a single-agent loop) +- Too many management layers — 3-level hierarchies are almost always slower than 2-level + +--- + +## Pattern Selection Decision Tree + +``` +Is the goal working software or optimizing a metric? +├── Software-shaped +│ ├── Single discrete task? → Single-Agent Iterative Loop +│ ├── Needs full autonomy (spec → code → eval)? → Autonomous Pipeline +│ └── Multiple specialized roles needed at scale? → Multi-Agent System +└── Metric-shaped + ├── Single metric to optimize? → Optimization Loop + └── Multiple metrics across coordinated roles? → Optimization Loop + Multi-Agent System +``` + +## Hybrid Architectures + +Real systems often combine patterns: + +- **Autonomous Pipeline + Optimization Loop:** Auto-iterate on prompts using eval scores +- **Single-Agent Loop + Multi-Agent System:** Individual coding agents orchestrated by a planner for large projects +- **Autonomous Pipeline + Multi-Agent System:** Autonomous pipeline with specialized roles (validate-agent, design-agent, code-agent) + +When combining, keep the management model simple. A 2-level structure (planner + workers) outperforms deeper hierarchies. 
diff --git a/plugins/agentic-architect/skills/agent-architecture-design/references/workflow-patterns.md b/plugins/agentic-architect/skills/agent-architecture-design/references/workflow-patterns.md new file mode 100644 index 000000000..1465ce111 --- /dev/null +++ b/plugins/agentic-architect/skills/agent-architecture-design/references/workflow-patterns.md @@ -0,0 +1,105 @@ +# Workflow Patterns by Framework + +Patterns from reference frameworks for designing agent workflows, organized by agentic design pattern. + +## OpenSpec (OPSX Conventions) + +**Source:** [OpenSpec](https://github.com/Fission-AI/OpenSpec) + +**Best for:** Autonomous Pipeline and Multi-Agent System + +**Core concept:** Artifact-driven dependency graph. Commands chain through file existence, not sequential phases. + +**Default workflow (spec-driven):** +``` +/opsx:explore → /opsx:propose → /opsx:apply → /opsx:archive +``` + +**Expanded workflow:** +``` +/opsx:new → /opsx:continue (×N) → /opsx:apply → /opsx:verify → /opsx:archive +``` + +**Key patterns:** +- **Artifact gates** — Each phase produces a file. Next phase checks file exists before starting. +- **Delta specs** — Changes are expressed as ADDED/MODIFIED/REMOVED operations on existing specs, not full rewrites. +- **Fast-forward** (`/opsx:ff`) — Generate all planning artifacts at once for clear-scope work. +- **Schema-configurable** — Workflow phases defined in `schema.yaml` as a DAG, not hardcoded. +- **Archive merges deltas** — Completed changes are merged back into main specs, keeping specs as source of truth. 
+ +**Artifact types:** +| Artifact | Purpose | +|---|---| +| `proposal.md` | WHAT and WHY (scope, non-goals, acceptance criteria) | +| `specs/*.md` | Behavior contracts with Given/When/Then scenarios | +| `design.md` | HOW (technical approach, decisions, risks) | +| `tasks.md` | Implementation checklist with checkboxes | +| `verify-report.md` | Verification results and traceability | + +--- + +## Superpowers + +**Source:** [Superpowers](https://github.com/obra/superpowers/) + +**Best for:** Single-Agent Iterative Loop and Autonomous Pipeline + +**Core concept:** Skills as workflow phases with hard gates and mandatory skill checks. + +**Workflow phases:** +1. Brainstorming — Explore requirements before committing +2. Writing Plans — Task decomposition +3. Executing Plans / Subagent-Driven Development — Implementation +4. Test-Driven Development — RED-GREEN-REFACTOR during implementation +5. Requesting Code Review — Verification +6. Finishing a Development Branch — Completion + +**Key patterns:** +- **``** — Synchronization points that prevent progression without explicit checks. Agent must verify conditions before proceeding. +- **The 1% Rule** — If there's even a 1% chance a skill applies, invoke it. Prevents agents from rationalizing past important steps. +- **`SUBAGENT-STOP`** — Prevents subagents from invoking full skill workflows when executing specific tasks. +- **Brainstorming before planning** — Always explore intent and requirements before committing to a plan. +- **Two-stage code review** — Spec compliance review then code quality review (not one combined review). + +--- + +## Compound Engineering + +**Source:** [Compound Engineering](https://github.com/EveryInc/compound-engineering-plugin) + +**Best for:** Autonomous Pipeline with learning loop + +**Core concept:** Four-phase repeating cycle where learnings compound across iterations. 
+ +**Workflow:** +``` +/ce:plan → /ce:work → /ce:review → /ce:compound → repeat +``` + +**Key patterns:** +- **Compounding loop** (`/ce:compound`) — After each cycle, document what worked and what didn't. Feed learnings into future planning. Each cycle gets easier. +- **Autonomous modes:** + - `/lfg` (Let's Go) — Sequential full cycle + - `/slfg` (Swarm LFG) — Parallel execution during review/testing +- **Multi-agent review** — Review phase dispatches multiple agents for parallel code review. +- **Knowledge accumulation** — Solutions documented in the compound phase become reusable patterns. + +--- + +## Framework Selection by Design Pattern + +| Agentic Design Pattern | Primary Framework | Secondary Framework | +|---|---|---| +| Single-Agent Iterative Loop | Superpowers (brainstorm → plan → TDD) | — | +| Autonomous Pipeline | OpenSpec (validate → propose → design → apply → verify) | Compound Engineering (learning loop) | +| Optimization Loop | Custom iteration loop (hypothesize → test → measure → decide) | — | +| Multi-Agent System | OpenSpec artifact gates + Superpowers hard gates | Compound Engineering (per-role learning) | + +## Universal Patterns (All Architectures) + +1. **Hard gates** — Check prerequisites before proceeding. Never silently skip. +2. **Artifact persistence** — Write phase outputs to disk, not just conversation context. Enables cross-session resumption. +3. **Workflow state metadata** — Track which phases are complete in a YAML file alongside artifacts. +4. **Error handling** — Standardize retry policy. Clear failure messages naming what to fix. +5. **Trivial escape hatch** — Document when it's OK to skip phases (small fixes, config changes). +6. **Artifact self-correction** — Downstream phases can fix factual errors in upstream artifacts, logged in a corrections section. 
diff --git a/plugins/agentic-architect/skills/agent-plugin-review/SKILL.md b/plugins/agentic-architect/skills/agent-plugin-review/SKILL.md new file mode 100644 index 000000000..3be083561 --- /dev/null +++ b/plugins/agentic-architect/skills/agent-plugin-review/SKILL.md @@ -0,0 +1,102 @@ +--- +name: agent-plugin-review +description: >- + Use when reviewing an AI plugin pull request, auditing plugin quality before release, + or when asked to "review a plugin PR", "review skills in this PR", "check plugin quality", + or "review workflow architecture". Covers skill quality, structural linting, and workflow + architecture review. +--- + +# Plugin Review + +## Overview + +Review AI plugin PRs by running deterministic structural checks first, then applying LLM judgment for skill quality and workflow architecture. Post findings as inline PR comments. + +## Process + +### Step 1: Structural lint + +Run `scripts/lint_plugin.py` against the plugin directory: + +```bash +python scripts/lint_plugin.py <plugin-dir> --evals-dir <evals-dir> --json +``` + +The script checks: +- Every `skills/*/SKILL.md` has a corresponding eval file +- SKILL.md frontmatter has `name` and `description` +- No hardcoded local paths (drive letters, absolute OS paths) +- No version printing instructions +- Referenced files (`references/*.md`) exist +- Commands reference existing skills +- Path style consistency across commands + +Report findings grouped by severity (error > warning > info). + +### Step 2: Eval lint + +If the PR includes eval files, invoke `agentv-eval-review` for AgentV-specific eval quality checks. + +### Step 3: Skill quality review (LLM judgment) + +For each SKILL.md, check against `references/skill-quality-checklist.md`: + +- Description starts with "Use when..." 
and describes triggering conditions only (not workflow) +- Description does NOT summarize the skill's process — this causes agents to follow the description instead of reading the SKILL.md body +- Body is concise — only include what the agent doesn't already know +- Imperative/infinitive form, not second person +- Heavy reference (100+ lines) moved to `references/` files +- One excellent code example beats many mediocre ones +- Flowcharts only for non-obvious decisions +- Keywords throughout for search discovery +- Cross-references use skill name with requirement markers, not `@` force-load syntax +- Discipline-enforcing skills have rationalization tables, red flags lists, and explicit loophole closures +- Consistency — no contradictions within or across files (tool names, filenames, commands, rules) +- No manual routing workarounds — if AGENTS.md or instruction files contain heavy TRIGGER/ACTION routing tables or skill-chain logic, the skill descriptions are likely too weak. Good descriptions enable auto-discovery without manual routing. + +### Step 4: Workflow architecture review (LLM judgment) + +For plugins with multi-phase workflows, check against `references/workflow-checklist.md`: + +- Hard gates between phases (artifact existence checks) +- Artifact persistence convention (defined output directory) +- Workflow state metadata for cross-session resumption +- Resumption protocol (detect existing artifacts, skip completed phases) +- Standardized error handling with retry +- Trivial change escape hatch +- Artifact self-correction with corrections log +- Learning loop mechanism + +### Step 5: Post review + +Post findings as inline PR comments at specific line numbers. 
Group by severity: +- **Critical** — Broken references, missing evals, factual contradictions, missing hard gates +- **Medium** — Naming inconsistencies, hardcoded paths, missing assertions, ad-hoc error handling +- **Low** — Style inconsistencies, description improvements + +Use a PR review (not individual comments) to batch all findings. + +## Skill Resources + +- `scripts/lint_plugin.py` — Deterministic plugin linter (Python 3.11+, stdlib only) +- `references/skill-quality-checklist.md` — Skill quality checklist (CSO, descriptions, content, discipline skills) +- `references/workflow-checklist.md` — Workflow architecture checklist (OpenSpec, hard gates, artifacts) + +## External References + +For deeper research on challenging reviews, consult these resources via web fetch, deepwiki, or clone the repo locally: + +- [Agent Skills specification](https://agentskills.io/specification) — Official SKILL.md format, frontmatter fields, progressive disclosure rules +- [Agent Skills best practices](https://agentskills.io/skill-creation/best-practices) — Context spending, calibrating control, gotchas, scripts, validation loops +- [Agent Skills description optimization](https://agentskills.io/skill-creation/optimizing-descriptions) — Trigger testing, train/validation splits, overfitting avoidance +- [Agent Skills using scripts](https://agentskills.io/skill-creation/using-scripts) — Self-contained scripts, --help, structured output, idempotency, exit codes +- [AgentV documentation](https://agentv.dev/) — Eval YAML schema, assertion types, workspace evals, multi-provider targets +- [OpenSpec](https://github.com/Fission-AI/OpenSpec) — Spec-driven development framework (OPSX conventions, artifact graphs, hard gates, delta specs) +- [Superpowers](https://github.com/obra/superpowers/) — Claude Code plugin with `<HARD-GATE>` pattern, brainstorming workflow, skill-based development phases +- [Compound Engineering](https://github.com/EveryInc/compound-engineering-plugin) — Four-phase workflow 
(Plan/Work/Review/Compound) with learning loop pattern + +## Related Skills + +- **agentv-eval-review** — Lint and review AgentV eval files (invoke for eval-specific checks) +- **agent-architecture-design** — Design agent architectures from scratch diff --git a/plugins/agentic-architect/skills/agent-plugin-review/references/skill-quality-checklist.md b/plugins/agentic-architect/skills/agent-plugin-review/references/skill-quality-checklist.md new file mode 100644 index 000000000..1a8f279ca --- /dev/null +++ b/plugins/agentic-architect/skills/agent-plugin-review/references/skill-quality-checklist.md @@ -0,0 +1,125 @@ +# Skill Quality Checklist + +Derived from [Superpowers writing-skills](https://github.com/obra/superpowers/) and [Anthropic's skill authoring best practices](https://docs.anthropic.com/en/docs/agents-and-tools/agent-skills). + +## Frontmatter + +- [ ] Only two fields: `name` and `description` (no other fields supported) +- [ ] Max 1024 characters total in frontmatter +- [ ] `name` uses only letters, numbers, and hyphens (no parentheses, special chars) +- [ ] `description` written in third person +- [ ] `description` starts with "Use when..." focusing on triggering conditions +- [ ] `description` describes WHEN to use, NOT WHAT the skill does +- [ ] `description` does NOT summarize the skill's workflow or process + +### Why description must not summarize workflow + +Testing revealed that when a description summarizes the skill's workflow, Claude may follow the description instead of reading the full SKILL.md content. A description saying "code review between tasks" caused Claude to do ONE review, even though the SKILL.md flowchart clearly showed TWO reviews. When the description was changed to just triggering conditions, Claude correctly read and followed the full skill. 
+ +### Description examples + +```yaml +# BAD: Summarizes workflow - Claude may follow this instead of reading skill +description: Use when executing plans - dispatches subagent per task with code review between tasks + +# BAD: Too much process detail +description: Use for TDD - write test first, watch it fail, write minimal code, refactor + +# BAD: Too abstract, vague +description: For async testing + +# BAD: First person +description: I can help you with async tests when they're flaky + +# GOOD: Just triggering conditions, no workflow summary +description: Use when executing implementation plans with independent tasks in the current session + +# GOOD: Triggering conditions only +description: Use when implementing any feature or bugfix, before writing implementation code + +# GOOD: Problem-focused, technology-agnostic +description: Use when tests have race conditions, timing dependencies, or pass/fail inconsistently +``` + +## Content Quality + +### Conciseness (Claude Search Optimization) + +- [ ] SKILL.md body is concise — only include what Claude doesn't already know +- [ ] Challenge each paragraph: "Does Claude really need this explanation?" 
+- [ ] Target word counts: + - Frequently-loaded skills: < 200 words + - Standard skills: < 500 words + - With references: SKILL.md lean, details in reference files +- [ ] Move heavy reference (100+ lines) to separate files +- [ ] Use cross-references instead of repeating content from other skills +- [ ] Compress examples — one excellent example beats many mediocre ones + +### Structure + +- [ ] Overview: core principle in 1-2 sentences +- [ ] When to Use: symptoms and use cases (flowchart only if decision is non-obvious) +- [ ] When NOT to use: explicit exclusions +- [ ] Core Pattern: before/after comparison (for techniques/patterns) +- [ ] Quick Reference: table or bullets for scanning +- [ ] Common Mistakes: what goes wrong + fixes +- [ ] Inline code for simple patterns, separate file for heavy reference + +### Writing Style + +- [ ] Imperative/infinitive form (verb-first instructions) +- [ ] NOT second person ("you should...") +- [ ] Technology-agnostic triggers unless skill is technology-specific +- [ ] Keywords throughout for search discovery (error messages, symptoms, synonyms, tool names) + +### Degrees of Freedom + +Match specificity to the task's fragility: + +| Freedom Level | When to Use | Example | +|---|---|---| +| High (text instructions) | Multiple valid approaches, context-dependent | Code review process | +| Medium (pseudocode/templates) | Preferred pattern exists, some variation OK | Report generation | +| Low (exact scripts) | Precise steps required, fragile operations | Database migration | + +## File Organization + +- [ ] Flat namespace — all skills in one searchable directory +- [ ] Supporting files only for: heavy reference (100+ lines), reusable tools/scripts +- [ ] Everything else inline in SKILL.md +- [ ] No narrative storytelling ("In session 2025-10-03, we found...") +- [ ] No multi-language dilution (one excellent example, not 5 mediocre ones) + +## Flowchart Usage + +- [ ] Use ONLY for non-obvious decision points, process loops, "A vs 
B" decisions +- [ ] Never use for: reference material (→ tables), code (→ code blocks), linear instructions (→ numbered lists) +- [ ] Labels must have semantic meaning (not "step1", "helper2") + +## Cross-References + +- [ ] Use skill name with explicit requirement markers: `**REQUIRED:** Use skill-name` +- [ ] Do NOT use `@` syntax to force-load files (burns context) +- [ ] Do NOT repeat content available in referenced skills + +## Anti-Patterns to Flag + +| Anti-Pattern | Why It's Bad | +|---|---| +| Narrative examples ("In session X, we found...") | Too specific, not reusable | +| Multi-language examples (JS, Python, Go, etc.) | Mediocre quality, maintenance burden | +| Code in flowcharts | Can't copy-paste, hard to read | +| Generic labels (helper1, step2) | No semantic meaning | +| Version printing instructions | Fragile, rely on git history | +| Hardcoded local paths | Machine-specific, not portable | +| Description summarizes workflow | Claude follows description, skips SKILL.md body | + +## Discipline-Enforcing Skills (Additional Checks) + +For skills that enforce rules (TDD, verification, coding standards): + +- [ ] Specific workarounds explicitly forbidden (not just "don't do X" but "don't keep it as reference, don't adapt it, delete means delete") +- [ ] Rationalization table present (common excuses + reality) +- [ ] Red flags list for self-checking +- [ ] "Spirit vs letter" addressed: "Violating the letter IS violating the spirit" +- [ ] Hard gates at critical decision points diff --git a/plugins/agentic-architect/skills/agent-plugin-review/references/workflow-checklist.md b/plugins/agentic-architect/skills/agent-plugin-review/references/workflow-checklist.md new file mode 100644 index 000000000..c5f3fa1f4 --- /dev/null +++ b/plugins/agentic-architect/skills/agent-plugin-review/references/workflow-checklist.md @@ -0,0 +1,78 @@ +# Workflow Architecture Checklist + +Review multi-phase plugin workflows against these patterns, derived from 
[OpenSpec](https://github.com/Fission-AI/OpenSpec) (OPSX conventions), [Superpowers](https://github.com/obra/superpowers/), and [Compound Engineering](https://github.com/EveryInc/compound-engineering-plugin). + +## Phase Coverage + +Compare the plugin's workflow phases against the OpenSpec artifact model: + +| OpenSpec Phase | OPSX Command | Expected Plugin Equivalent | +|---|---|---| +| Explore | `/opsx:explore` | Research mode — investigate without creating artifacts | +| Validate | (custom) | Check requirements against real codebase before design | +| Propose | `/opsx:propose` | Define WHAT and WHY with acceptance criteria | +| Design | (via schema) | Plan HOW — file-level changes, multi-repo coordination | +| Tasks | (via schema) | Standalone `tasks.md` with `- [ ]` checkboxes | +| Apply | `/opsx:apply` | Implement through task checklist with TDD | +| Verify | `/opsx:verify` | Build + test + trace implementation back to specs | +| Archive | `/opsx:archive` | Finalize, merge deltas, persist learnings | + +Not all phases are required for every plugin. Flag missing phases only when the gap would cause real problems. + +## Hard Gates + +From [Superpowers](https://github.com/obra/superpowers/) `<HARD-GATE>` pattern: + +- [ ] Each phase checks for prerequisite artifacts before proceeding +- [ ] Gate failure message tells the user which command/skill to run first +- [ ] Gates cannot be silently bypassed +- [ ] Gate checks happen at the start of the skill, before any work + +Example gate: +``` +HARD GATE: `hld-review.md` MUST exist in {output_dir}/. +If missing, inform the user: "Run the design-review skill first." STOP. 
+``` + +## Artifact Contracts + +- [ ] Each phase produces a defined output artifact (e.g., `context.md`, `design.md`, `tasks.md`) +- [ ] Output format of phase N matches expected input of phase N+1 +- [ ] Artifact location convention is defined (not just `{output_dir}/`) +- [ ] Artifacts persist to disk (not just conversation context) for cross-session resumption + +## Workflow State + +- [ ] Workflow state tracked in a metadata file (e.g., `.workflow.yaml`) alongside artifacts +- [ ] Metadata records: which phases are complete, timestamps, WI/issue number +- [ ] Resumption protocol detects existing artifacts and skips completed phases +- [ ] Partial completion is handled (e.g., Phase 4 with N-1 of N agents succeeding) + +## Error Handling + +- [ ] Standardized retry policy across all skills (e.g., retry MCP calls 3x with exponential backoff) +- [ ] Clear failure reporting — user knows what failed and what to do next +- [ ] Errors don't silently corrupt downstream phases +- [ ] Critical failures (P0 findings, merge conflicts) stop the workflow + +## Escape Hatches + +- [ ] Trivial change escape: small fixes can skip spec phases +- [ ] Criteria for "trivial" are documented (e.g., < 20 lines, single file, no schema change) +- [ ] Artifact self-correction: downstream phases can fix factual errors in upstream artifacts +- [ ] Corrections are logged (e.g., `## Corrections Log` section) for auditability + +## Learning Loop + +From [Compound Engineering](https://github.com/EveryInc/compound-engineering-plugin) `/ce:compound` pattern: + +- [ ] Mechanism exists to capture patterns from completed work +- [ ] Learnings feed back into future workflow runs (e.g., review guidelines, common patterns) +- [ ] Learning artifacts are version-controlled and mergeable + +## Fast-Forward Mode + +From OpenSpec `/opsx:ff`: + +- [ ] For well-understood changes, all planning artifacts can be generated in one pass +- [ ] Fast-forward mode is optional — users can still step through phases 
individually diff --git a/plugins/agentic-architect/skills/agent-plugin-review/scripts/lint_plugin.py b/plugins/agentic-architect/skills/agent-plugin-review/scripts/lint_plugin.py new file mode 100644 index 000000000..15f0be189 --- /dev/null +++ b/plugins/agentic-architect/skills/agent-plugin-review/scripts/lint_plugin.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python3 +"""Lint AI plugin structure for common issues. + +Usage: python lint_plugin.py [--evals-dir ] [--json] + +Checks: + - Every skills/*/SKILL.md has a corresponding eval file + - SKILL.md frontmatter has name and description + - No hardcoded local paths (drive letters, absolute OS paths) + - No version printing instructions + - Commands reference existing skills + - Path style consistency across commands + - Referenced files (references/*.md) exist + +Exit code: 0 if no issues, 1 if issues found. +""" + +import json +import os +import re +import sys +from pathlib import Path + + +def find_skills(plugin_dir: Path) -> list[Path]: + """Find all SKILL.md files in the plugin.""" + return sorted(plugin_dir.rglob("skills/*/SKILL.md")) + + +def find_evals(evals_dir: Path, plugin_name: str) -> list[Path]: + """Find eval files for a plugin.""" + plugin_evals = evals_dir / plugin_name + if not plugin_evals.exists(): + return [] + return sorted(plugin_evals.rglob("*.yaml")) + sorted(plugin_evals.rglob("*.yml")) + + +def find_commands(plugin_dir: Path) -> list[Path]: + """Find command files.""" + commands_dir = plugin_dir / "commands" + if not commands_dir.exists(): + return [] + return sorted(commands_dir.glob("*.md")) + + +def lint_plugin(plugin_dir: Path, evals_dir: Path | None = None) -> list[dict]: + issues = [] + + def issue(severity: str, msg: str, file: str | None = None, line: int | None = None): + issues.append({ + "file": file or str(plugin_dir), + "severity": severity, + "message": msg, + "line": line, + }) + + plugin_name = plugin_dir.name + skills = find_skills(plugin_dir) + commands = 
find_commands(plugin_dir) + + # Collect skill names + skill_names = set() + for skill_path in skills: + skill_name = skill_path.parent.name + skill_names.add(skill_name) + + # Check each SKILL.md + for skill_path in skills: + skill_name = skill_path.parent.name + text = skill_path.read_text(encoding="utf-8") + lines = text.splitlines() + + # Check frontmatter + if not text.startswith("---"): + issue("error", "Missing YAML frontmatter", str(skill_path)) + else: + fm_end = text.find("---", 3) + if fm_end == -1: + issue("error", "Unclosed YAML frontmatter", str(skill_path)) + else: + fm = text[3:fm_end] + if "name:" not in fm: + issue("error", "Frontmatter missing 'name' field", str(skill_path)) + if "description:" not in fm: + issue("error", "Frontmatter missing 'description' field", str(skill_path)) + + # Check for hardcoded paths + drive_letter_pat = re.compile(r'[A-Z]:\\[A-Za-z]') + for i, line in enumerate(lines, 1): + if drive_letter_pat.search(line): + # Skip if it's in a table header or obviously an example + if "Override" not in line and "Example" not in line: + issue("warning", f"Hardcoded local path detected", str(skill_path), i) + + # Check for version printing + version_pat = re.compile(r'print.*version|version \d{8}', re.IGNORECASE) + for i, line in enumerate(lines, 1): + if version_pat.search(line): + issue("warning", "Version printing instruction — rely on git history", str(skill_path), i) + + # Check referenced files exist + ref_pat = re.compile(r'`(references/[^`]+)`') + skill_dir = skill_path.parent + for i, line in enumerate(lines, 1): + for match in ref_pat.finditer(line): + ref_path = skill_dir / match.group(1) + if not ref_path.exists(): + issue("error", f"Referenced file does not exist: {match.group(1)}", str(skill_path), i) + + # Check for non-existent command references + cmd_pat = re.compile(r'/([a-z][a-z0-9-]+)') + cmd_names = {c.stem for c in commands} + for i, line in enumerate(lines, 1): + for match in cmd_pat.finditer(line): + cmd_ref = 
match.group(1) + # Skip common false positives + if cmd_ref in ("dev", "null", "tmp", "etc", "usr", "bin", "opsx"): + continue + if cmd_ref.startswith("opsx:") or cmd_ref.startswith("ce:"): + continue + if cmd_ref not in cmd_names and cmd_ref not in skill_names: + # Only flag if it looks like a slash command (preceded by whitespace or start of line) + before = line[:match.start()].rstrip() + if before == "" or before.endswith((" ", "\t", '"', "'", ":")): + issue("info", f"References /{cmd_ref} — not found in commands/ or skills/", str(skill_path), i) + + # Check eval coverage + if evals_dir: + eval_files = find_evals(evals_dir, plugin_name) + eval_stems = set() + for ef in eval_files: + stem = ef.stem.replace(".eval", "") + eval_stems.add(stem) + + for skill_name in sorted(skill_names): + # Check various naming patterns + has_eval = ( + skill_name in eval_stems + or skill_name.replace(plugin_name + "-", "") in eval_stems + or any(skill_name in s for s in eval_stems) + ) + if not has_eval: + issue("warning", f"Skill '{skill_name}' has no corresponding eval file", str(plugin_dir / "skills" / skill_name / "SKILL.md")) + + # Check command path consistency + path_styles = set() + for cmd_path in commands: + text = cmd_path.read_text(encoding="utf-8") + if "plugins/" in text: + path_styles.add("absolute") + if re.search(r'skills/[a-z]', text) and "plugins/" not in text.split("skills/")[0][-20:]: + path_styles.add("relative") + if len(path_styles) > 1: + issue("info", "Commands use mixed path styles (some relative, some absolute)", str(plugin_dir / "commands")) + + return issues + + +def main(): + if len(sys.argv) < 2: + print(f"Usage: {sys.argv[0]} [--evals-dir ] [--json]", file=sys.stderr) + sys.exit(2) + + plugin_dir = Path(sys.argv[1]) + output_json = "--json" in sys.argv + + evals_dir = None + if "--evals-dir" in sys.argv: + idx = sys.argv.index("--evals-dir") + if idx + 1 < len(sys.argv): + evals_dir = Path(sys.argv[idx + 1]) + + if not plugin_dir.is_dir(): + 
print(f"Error: {plugin_dir} is not a directory", file=sys.stderr) + sys.exit(2) + + issues = lint_plugin(plugin_dir, evals_dir) + + if output_json: + print(json.dumps(issues, indent=2)) + else: + for iss in issues: + line = f":{iss['line']}" if iss.get("line") else "" + print(f"[{iss['severity'].upper()}] {iss['file']}{line}: {iss['message']}") + + counts = {} + for iss in issues: + counts[iss["severity"]] = counts.get(iss["severity"], 0) + 1 + if issues: + print(f"\n{len(issues)} issues: {', '.join(f'{v} {k}' for k, v in sorted(counts.items()))}") + else: + print("No issues found.") + + sys.exit(1 if any(i["severity"] == "error" for i in issues) else 0) + + +if __name__ == "__main__": + main() diff --git a/plugins/agentv-dev/skills/agentv-eval-review/SKILL.md b/plugins/agentv-dev/skills/agentv-eval-review/SKILL.md new file mode 100644 index 000000000..23e2c3466 --- /dev/null +++ b/plugins/agentv-dev/skills/agentv-eval-review/SKILL.md @@ -0,0 +1,52 @@ +--- +name: agentv-eval-review +description: >- + Use when reviewing eval YAML files for quality issues, linting eval files before + committing, checking eval schema compliance, or when asked to "review these evals", + "check eval quality", "lint eval files", or "validate eval structure". + Do NOT use for writing evals (use agentv-eval-writer) or running evals (use agentv-bench). +--- + +# Eval Review + +## Overview + +Lint and review AgentV eval YAML files for structural issues, schema compliance, and quality problems. Runs deterministic checks via script, then applies LLM judgment for semantic issues the script cannot catch. 
+ +## Process + +### Step 1: Run the linter + +Execute `scripts/lint_eval.py` against the target eval files: + +```bash +python scripts/lint_eval.py <eval-file-or-dir> --json +``` + +The script checks: +- `.eval.yaml` extension +- `description` field present +- Each test has `id`, `input`, and at least one of `criteria`/`expected_output`/`assertions` +- File paths in `type: file` use leading `/` +- `assertions` blocks present (flags tests relying solely on `expected_output`) +- `expected_output` prose detection (flags "The agent should..." patterns) +- Repeated file inputs across tests (recommends top-level `input`) +- Naming prefix consistency across eval files in same directory + +### Step 2: Review script output + +Report the script findings grouped by severity (error > warning > info). For each finding, include the file path and a concrete fix. + +### Step 3: Semantic review (LLM judgment) + +The script catches structural issues but cannot assess: +- **Factual accuracy** — Do tool/command names in expected_output match what the skill documents? +- **Coverage gaps** — Are important edge cases missing? +- **Assertion discriminability** — Would assertions pass for both good and bad output? +- **Cross-file consistency** — Do output filenames match across evals and skills? + +Read the relevant SKILL.md files and cross-check against the eval content for these issues. + +## Skill Resources + +- `scripts/lint_eval.py` — Deterministic eval linter (Python 3.11+, stdlib only) diff --git a/plugins/agentv-dev/skills/agentv-eval-review/scripts/lint_eval.py b/plugins/agentv-dev/skills/agentv-eval-review/scripts/lint_eval.py new file mode 100644 index 000000000..1ba450885 --- /dev/null +++ b/plugins/agentv-dev/skills/agentv-eval-review/scripts/lint_eval.py @@ -0,0 +1,239 @@ +#!/usr/bin/env python3 +"""Lint AgentV eval YAML files for common issues. 
+ +Usage: python lint_eval.py [--json] + +Checks: + - File uses .eval.yaml extension + - description field present + - Each test has id, input, criteria + - File paths in type:file use leading / + - assertions blocks present (not relying solely on expected_output) + - expected_output does not contain evaluation criteria prose + - Repeated file inputs across tests (should use top-level input) + - Naming prefix consistency across eval files in same directory + +Exit code: 0 if no issues, 1 if issues found. +""" + +import json +import os +import re +import sys +from pathlib import Path + +try: + import yaml +except ImportError: + # Fall back to basic YAML parsing if PyYAML not available + yaml = None + + +def parse_yaml_basic(text: str) -> dict: + """Minimal YAML-ish parser for when PyYAML is unavailable.""" + # This is a best-effort fallback; recommend installing PyYAML + import ast + # Try json first (YAML is a superset of JSON) + try: + return json.loads(text) + except Exception: + pass + return {} + + +def load_yaml(path: Path) -> dict: + text = path.read_text(encoding="utf-8") + if yaml: + return yaml.safe_load(text) or {} + return parse_yaml_basic(text) + + +def lint_file(path: Path) -> list[dict]: + issues = [] + + def issue(severity: str, msg: str, line: int | None = None): + issues.append({"file": str(path), "severity": severity, "message": msg, "line": line}) + + # Check extension + if not path.name.endswith(".eval.yaml"): + issue("error", f"File should use .eval.yaml extension, got: {path.name}") + + try: + data = load_yaml(path) + except Exception as e: + issue("error", f"Failed to parse YAML: {e}") + return issues + + if not isinstance(data, dict): + issue("error", "Root element is not a mapping") + return issues + + # Check description + if "description" not in data: + issue("warning", "Missing top-level 'description' field") + + tests = data.get("tests", []) + if not isinstance(tests, list): + issue("error", "'tests' is not a list") + return issues + + 
if not tests: + issue("warning", "No tests defined") + return issues + + # Check for top-level input (shared file references) + top_level_input = data.get("input") + + # Collect file values across tests to detect repetition + file_values_per_test: list[list[str]] = [] + + for i, test in enumerate(tests): + test_id = test.get("id", f"test-{i}") + + if "id" not in test: + issue("error", f"Test at index {i} missing 'id'") + + if "input" not in test and top_level_input is None: + issue("error", f"Test '{test_id}' missing 'input' and no top-level input defined") + + has_criteria = "criteria" in test + has_expected = "expected_output" in test + has_assertions = "assertions" in test + + if not has_criteria and not has_expected and not has_assertions: + issue("error", f"Test '{test_id}' needs at least one of: criteria, expected_output, assertions") + + # Check assertions present + if not has_assertions and has_expected: + issue("warning", f"Test '{test_id}' has expected_output but no assertions — add deterministic assertions where possible") + + # Check expected_output for prose patterns + if has_expected: + expected = test["expected_output"] + expected_text = "" + if isinstance(expected, str): + expected_text = expected + elif isinstance(expected, list): + for msg in expected: + if isinstance(msg, dict): + content = msg.get("content", "") + if isinstance(content, str): + expected_text += content + + prose_patterns = [ + r"[Tt]he agent should", + r"[Ss]hould identify", + r"[Ss]hould flag", + r"[Ss]hould recommend", + r"[Ss]hould produce", + r"[Ss]hould detect", + r"[Ss]hould load", + r"[Ss]hould run", + ] + for pat in prose_patterns: + if re.search(pat, expected_text): + issue("warning", f"Test '{test_id}' expected_output contains evaluation criteria prose ('{pat.lstrip('[Tt]').lstrip('[Ss]')}...') — use criteria or assertions instead") + break + + # Collect file values from input + test_files = extract_file_values(test.get("input", [])) + 
file_values_per_test.append(test_files) + + # Check file paths for leading / + for fv in test_files: + if not fv.startswith("/"): + issue("warning", f"Test '{test_id}' file path missing leading '/': {fv}") + + # Check for repeated file inputs + if len(file_values_per_test) >= 2 and not top_level_input: + common_files = set(file_values_per_test[0]) + for fvs in file_values_per_test[1:]: + common_files &= set(fvs) + if common_files: + issue("info", f"File input repeated in every test: {', '.join(sorted(common_files))} — consider using top-level input") + + return issues + + +def extract_file_values(input_data) -> list[str]: + """Extract type:file values from input structure.""" + files = [] + if isinstance(input_data, list): + for item in input_data: + if isinstance(item, dict): + content = item.get("content", []) + if isinstance(content, list): + for c in content: + if isinstance(c, dict) and c.get("type") == "file": + v = c.get("value", "") + if v: + files.append(v) + return files + + +def lint_directory(path: Path) -> list[dict]: + issues = [] + eval_files = sorted(path.rglob("*.yaml")) + sorted(path.rglob("*.yml")) + + if not eval_files: + issues.append({"file": str(path), "severity": "warning", "message": "No eval files found", "line": None}) + return issues + + # Check naming prefix consistency + prefixes = set() + for f in eval_files: + name = f.stem.replace(".eval", "") + parts = name.split("-") + if len(parts) >= 2: + prefixes.add(parts[0]) + + if len(prefixes) > 1: + issues.append({ + "file": str(path), + "severity": "info", + "message": f"Inconsistent naming prefixes: {', '.join(sorted(prefixes))}", + "line": None, + }) + + for f in eval_files: + issues.extend(lint_file(f)) + + return issues + + +def main(): + if len(sys.argv) < 2: + print(f"Usage: {sys.argv[0]} [--json]", file=sys.stderr) + sys.exit(2) + + target = Path(sys.argv[1]) + output_json = "--json" in sys.argv + + if target.is_file(): + issues = lint_file(target) + elif target.is_dir(): + issues 
= lint_directory(target) + else: + print(f"Error: {target} not found", file=sys.stderr) + sys.exit(2) + + if output_json: + print(json.dumps(issues, indent=2)) + else: + for iss in issues: + line = f":{iss['line']}" if iss.get("line") else "" + print(f"[{iss['severity'].upper()}] {iss['file']}{line}: {iss['message']}") + + counts = {} + for iss in issues: + counts[iss["severity"]] = counts.get(iss["severity"], 0) + 1 + if issues: + print(f"\n{len(issues)} issues: {', '.join(f'{v} {k}' for k, v in sorted(counts.items()))}") + else: + print("No issues found.") + + sys.exit(1 if any(i["severity"] == "error" for i in issues) else 0) + + +if __name__ == "__main__": + main()