EntityProcess · christso · Mar 25, 2026 · Mar 25, 2026 · Mar 25, 2026 · Mar 25, 2026
diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml
@@ -10,11 +10,21 @@ targets:
     system_prompt: "Answer directly based on the information provided."
     grader_target: gemini-flash
 
+  - name: pi-cli  # Target selected under execution.targets in evals/agentic-architect/agent-plugin-review.eval.yaml
+    provider: pi-cli
+    grader_target: gemini-flash  # NOTE(review): presumably must match the `name:` of another target in this file; that entry is outside this hunk - confirm
+
+  - name: pi-coding-agent
+    provider: pi-coding-agent
+    subprovider: openrouter  # NOTE(review): presumably routes model calls through OpenRouter - confirm against the provider documentation
+    model: z-ai/glm-4.7
+    api_key: ${{ OPENROUTER_API_KEY }}  # Placeholder, not a literal key; presumably resolved from the environment like the other ${{ ... }} values in this file - confirm
+    system_prompt: "Answer directly based on the information provided."
+    grader_target: gemini-flash
+
   - name: codex
     provider: codex
     grader_target: gemini-llm
-    cwd: ${{ CODEX_WORKSPACE_DIR }}            # Where scratch workspaces are created
-    log_dir: ${{ CODEX_LOG_DIR }}              # Optional: where Codex CLI stream logs are stored (defaults to ./.agentv/logs/codex)
     log_format: json                    # Optional: 'summary' (default) or 'json' for raw event logs
 
   - name: gemini-llm

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
@@ -16,6 +16,11 @@
       "name": "agentv-claude-trace",
       "description": "Session tracing plugin — exports Claude Code session traces via OpenTelemetry",
       "source": "./plugins/agentv-claude-trace"
+    },
+    {
+      "name": "agentic-architect",
+      "description": "Design and review AI agent architectures — species selection, workflow patterns, and plugin quality review",
+      "source": "./plugins/agentic-architect"
     }
   ]
 }
diff --git a/.github/plugin/marketplace.json b/.github/plugin/marketplace.json
@@ -16,6 +16,11 @@
       "name": "agentv-claude-trace",
       "description": "Session tracing plugin — exports Claude Code session traces via OpenTelemetry",
       "source": "./plugins/agentv-claude-trace"
+    },
+    {
+      "name": "agentic-architect",
+      "description": "Design and review AI agent architectures — species selection, workflow patterns, and plugin quality review",
+      "source": "./plugins/agentic-architect"
     }
   ]
 }
diff --git a/evals/agentic-architect/agent-plugin-review.eval.yaml b/evals/agentic-architect/agent-plugin-review.eval.yaml
@@ -0,0 +1,131 @@
+description: Evaluates that the agent-plugin-review skill is triggered and catches planted issues in a mock plugin
+
+execution:
+  targets:
+    - pi-cli  # Target `name:` defined in .agentv/targets.yaml
+
+workspace:
+  template: ./workspace-template  # Mock deploy-auto plugin whose defects are planted on purpose; do not "fix" or annotate files under it
+
+tests:
+  - id: detect-missing-eval  # NOTE(review): the fixture evals visible here cover only deploy-plan and deploy-execute; the deploy-rollback skill itself is not visible in this view - confirm it exists in the template
+    criteria: Identifies that deploy-rollback skill has no corresponding eval file
+    input: |
+      Review the deploy-auto plugin in this repo for completeness.
+      Check that every skill has a corresponding eval file.
+    assertions:
+      - type: skill-trigger
+        value: agent-plugin-review
+      - type: contains
+        value: deploy-rollback
+      - type: rubrics
+        criteria:
+          - Flags that deploy-rollback skill has no corresponding eval file
+          - Does not flag deploy-plan or deploy-execute as missing evals
+
+  - id: detect-eval-naming  # Planted issue: the fixture file is named deploy-plan.yaml (no .eval suffix) while its sibling is deploy-execute.eval.yaml
+    criteria: Identifies eval files using bare .yaml instead of .eval.yaml
+    input: |
+      Review the eval files under evals/deploy-auto/ for naming convention issues.
+    assertions:
+      - type: skill-trigger
+        value: agent-plugin-review
+      - type: contains
+        value: .eval.yaml
+      - type: rubrics
+        criteria:
+          - Flags deploy-plan.yaml as using wrong extension
+          - Recommends renaming to .eval.yaml
+          - Does not flag deploy-execute.eval.yaml
+
+  - id: detect-missing-assertions  # Planted issue: every test in the fixture deploy-plan.yaml has expected_output prose ("The agent should ...") and no assertions key
+    criteria: Identifies eval tests without assertions that rely solely on expected_output prose
+    input: |
+      Review evals/deploy-auto/deploy-plan.yaml for eval quality issues.
+      Check assertion coverage and expected_output format.
+    assertions:
+      - type: skill-trigger
+        value: agent-plugin-review
+      - type: rubrics
+        criteria:
+          - Flags that no assertions are defined in deploy-plan.yaml
+          - Notes that expected_output contains evaluation criteria prose rather than sample responses
+          - Suggests adding deterministic assertions
+
+  - id: detect-relative-file-paths  # Planted issue: the fixture deploy-plan.yaml uses "plugins/deploy-auto/..." whereas deploy-execute.eval.yaml uses "/plugins/deploy-auto/..."
+    criteria: Identifies eval file paths missing leading slash
+    input: |
+      Review evals/deploy-auto/deploy-plan.yaml for file path formatting issues.
+    assertions:
+      - type: skill-trigger
+        value: agent-plugin-review
+      - type: rubrics
+        criteria:
+          - Flags that file paths are missing a leading slash
+          - Shows the corrected path format with leading slash
+
+  - id: detect-repeated-inputs  # Planted issue: all three tests in the fixture deploy-plan.yaml repeat the same deploy-plan/SKILL.md file input
+    criteria: Identifies eval files repeating the same file input in every test
+    input: |
+      Review evals/deploy-auto/deploy-plan.yaml for structural improvements.
+      Look at how inputs are organized across test cases.
+    assertions:
+      - type: skill-trigger
+        value: agent-plugin-review
+      - type: rubrics
+        criteria:
+          - Identifies the repeated SKILL.md file input across all 3 tests
+          - Recommends using top-level input for the shared file reference
+
+  - id: detect-missing-hard-gates  # Planted issue: the fixture deploy-execute SKILL.md reads deploy-plan.md, but its Process section has no step that checks the file exists first
+    criteria: Identifies that deploy-execute has no hard gate checking for deploy-plan.md
+    input: |
+      Review the deploy-auto plugin's workflow architecture.
+      Check whether phases enforce prerequisites before proceeding.
+    assertions:
+      - type: skill-trigger
+        value: agent-plugin-review
+      - type: rubrics
+        criteria:
+          - Flags that deploy-execute does not check for deploy-plan.md before starting
+          - Recommends adding hard gates between phases
+          - Suggests stopping with a clear message if prerequisites are missing
+
+  - id: detect-factual-contradiction  # Planted issue: the fixture SKILL.md says pytest with --tb=short; the run-tests expected_output in the fixture eval says python -m unittest discover
+    criteria: Identifies that deploy-execute says pytest but its eval says python -m unittest
+    input: |
+      Review evals/deploy-auto/deploy-execute.eval.yaml for factual accuracy.
+      Cross-check expected outputs against what the skills actually document.
+    assertions:
+      - type: skill-trigger
+        value: agent-plugin-review
+      - type: rubrics
+        criteria:
+          - Flags the contradiction between pytest (skill) and python -m unittest (eval)
+          - Recommends updating the eval to match the skill
+
+  - id: detect-nonexistent-command-reference  # NOTE(review): deploy-plan/SKILL.md is not visible in this view; the only command file visible under commands/ is deploy-pipeline.md - confirm the /deploy-execute reference is present in the fixture
+    criteria: Identifies that deploy-plan references /deploy-execute which is not a command
+    input: |
+      Review plugins/deploy-auto/skills/deploy-plan/SKILL.md for cross-reference issues.
+      Check that referenced commands and skills actually exist.
+    assertions:
+      - type: skill-trigger
+        value: agent-plugin-review
+      - type: rubrics
+        criteria:
+          - Flags that /deploy-execute is referenced but does not exist as a slash command
+          - Notes the distinction between skills and slash commands
+          - Suggests either creating the command or updating the handoff
+
+  - id: detect-hardcoded-paths  # Planted issue: the Configuration table in the fixture deploy-execute SKILL.md lists a Windows user-profile path as the default
+    criteria: Identifies hardcoded local paths in deploy-execute skill
+    input: |
+      Review plugins/deploy-auto/skills/deploy-execute/SKILL.md for portability issues.
+    assertions:
+      - type: skill-trigger
+        value: agent-plugin-review
+      - type: rubrics
+        criteria:
+          - Flags the hardcoded path C:\Users\admin\.kube\config
+          - Recommends using environment variables or configurable defaults
diff --git a/evals/agentic-architect/workspace-template/evals/deploy-auto/deploy-execute.eval.yaml b/evals/agentic-architect/workspace-template/evals/deploy-auto/deploy-execute.eval.yaml
@@ -0,0 +1,48 @@
+description: Tests the deploy-execute skill
+
+tests:
+  - id: execute-plan
+    criteria: Executes deployment steps from deploy-plan.md
+    input:
+      - role: user
+        content:
+          - type: file
+            value: "/plugins/deploy-auto/skills/deploy-execute/SKILL.md"
+          - type: text
+            value: "Execute the deployment plan at ./output/deploy-plan.md"
+    assertions:
+      - type: rubrics
+        criteria:
+          - Reads the deployment plan file
+          - Executes steps in order
+          - Runs health checks after each step
+
+  - id: health-check-failure
+    criteria: Stops and rolls back on health check failure
+    input:
+      - role: user
+        content:
+          - type: file
+            value: "/plugins/deploy-auto/skills/deploy-execute/SKILL.md"
+          - type: text
+            value: "The API service health check is failing after deployment. What should happen?"
+    assertions:
+      - type: contains
+        value: rollback
+      - type: rubrics
+        criteria:
+          - Recommends executing the rollback command
+          - Stops the deployment pipeline
+
+  - id: run-tests
+    criteria: Runs integration tests after deployment
+    input:
+      - role: user
+        content:
+          - type: file
+            value: "/plugins/deploy-auto/skills/deploy-execute/SKILL.md"
+          - type: text
+            value: "Deployment is complete. Run the integration tests."
+    expected_output:
+      - role: assistant
+        content: "The agent should run the test suite using python -m unittest discover to verify the deployment."
diff --git a/evals/agentic-architect/workspace-template/evals/deploy-auto/deploy-plan.yaml b/evals/agentic-architect/workspace-template/evals/deploy-auto/deploy-plan.yaml
@@ -0,0 +1,41 @@
+description: Tests the deploy-plan skill
+
+tests:
+  - id: basic-plan
+    criteria: Creates a deployment plan from a release spec
+    input:
+      - role: user
+        content:
+          - type: file
+            value: "plugins/deploy-auto/skills/deploy-plan/SKILL.md"
+          - type: text
+            value: "Create a deployment plan for releasing v2.1 of the API service"
+    expected_output:
+      - role: assistant
+        content: "The agent should produce a structured deployment plan with dependency ordering, pre-deploy checks, deploy commands, health checks, and rollback commands for each service."
+
+  - id: multi-service-ordering
+    criteria: Orders deployments by dependency graph
+    input:
+      - role: user
+        content:
+          - type: file
+            value: "plugins/deploy-auto/skills/deploy-plan/SKILL.md"
+          - type: text
+            value: "Plan deployment for 3 services: frontend (depends on API), API (depends on database), database (no deps)"
+    expected_output:
+      - role: assistant
+        content: "The agent should order: database first, then API, then frontend."
+
+  - id: rollback-checkpoints
+    criteria: Includes rollback checkpoints for each step
+    input:
+      - role: user
+        content:
+          - type: file
+            value: "plugins/deploy-auto/skills/deploy-plan/SKILL.md"
+          - type: text
+            value: "The release has 4 services. Make sure I can rollback at any point."
+    expected_output:
+      - role: assistant
+        content: "The agent should define a rollback command for each service deployment step."
diff --git a/evals/agentic-architect/workspace-template/plugins/deploy-auto/AGENTS.md b/evals/agentic-architect/workspace-template/plugins/deploy-auto/AGENTS.md
@@ -0,0 +1,11 @@
+# Deploy Auto Plugin
+
+## Rule: Deployment Workflow
+
+TRIGGER: Working on a deployment or release
+ACTION: Follow the deploy pipeline. Use `/deploy-pipeline`.
+
+## Rule: Multi-Service Coordination
+
+TRIGGER: Deployment spans multiple services
+ACTION: Deploy in dependency order — databases first, backends second, frontends last.
diff --git a/...ic-architect/workspace-template/plugins/deploy-auto/commands/deploy-pipeline.md b/...ic-architect/workspace-template/plugins/deploy-auto/commands/deploy-pipeline.md
@@ -0,0 +1,24 @@
+---
+description: "Run the full deployment pipeline: plan, execute, and verify"
+argument-hint: "<RELEASE_SPEC>"
+---
+
+# Deploy Pipeline
+
+## Input
+
+$ARGUMENTS = path to release specification
+
+## Execution
+
+### Phase 1 — Plan
+
+Load and follow: `skills/deploy-plan/SKILL.md`
+
+### Phase 2 — Execute
+
+Load and follow: `skills/deploy-execute/SKILL.md`
+
+### Phase 3 — Verify
+
+Run integration tests and produce a deployment report.
diff --git a/...architect/workspace-template/plugins/deploy-auto/skills/deploy-execute/SKILL.md b/...architect/workspace-template/plugins/deploy-auto/skills/deploy-execute/SKILL.md
@@ -0,0 +1,38 @@
+---
+name: deploy-execute
+description: >-
+  This skill should be used when asked to "execute a deployment", "run the deploy plan",
+  or "deploy services". Reads deploy-plan.md and executes each step with health checks.
+---
+
+# Deploy Execute Skill
+
+## Purpose
+
+Execute a deployment plan step-by-step. Reads `{output_dir}/deploy-plan.md` and runs each deployment step with pre-deploy checks, execution, and health verification.
+
+## Process
+
+Read the deployment plan and execute each step in order.
+
+For each service:
+1. Run pre-deploy checks
+2. Execute the deploy command using `kubectl apply`
+3. Run health checks
+4. If health check fails, execute rollback command and stop
+
+## Test Execution
+
+Execute integration tests after deployment using pytest with the `--tb=short` flag for concise tracebacks.
+
+## Configuration
+
+| Setting | Default | Override |
+|---------|---------|----------|
+| Kubernetes context | `C:\Users\admin\.kube\config` | User specifies alternative path |
+| Deploy timeout | 300s | `--timeout` flag |
+| Health check retries | 3 | `--retries` flag |
+
+## Skill Resources
+
+- `references/health-check-patterns.md` — Health check implementation patterns
diff --git a/...e/plugins/deploy-auto/skills/deploy-execute/references/health-check-patterns.md b/...e/plugins/deploy-auto/skills/deploy-execute/references/health-check-patterns.md
@@ -0,0 +1,19 @@
+# Health Check Patterns
+
+## HTTP Health Check
+
+```bash
+curl -sf http://service:8080/health || exit 1
+```
+
+## TCP Health Check
+
+```bash
+nc -z service 8080 || exit 1
+```
+
+## Custom Script
+
+```bash
+./scripts/check-service.sh --service api --timeout 30
+```