|
| 1 | +kind: Eval |
| 2 | +metadata: |
| 3 | + name: "stackrox-mcp-e2e" |
| 4 | +config: |
| 5 | + agent: |
| 6 | + type: "builtin.claude-code" |
| 7 | + model: "claude-sonnet-4-5" |
| 8 | + llmJudge: |
| 9 | + env: |
| 10 | + baseUrlKey: JUDGE_BASE_URL |
| 11 | + apiKeyKey: JUDGE_API_KEY |
| 12 | + modelNameKey: JUDGE_MODEL_NAME |
| 13 | + mcpConfigFile: mcp-config.yaml |
| 14 | + taskSets: |
| 15 | + # Assertion Fields Explained: |
| 16 | + # - toolsUsed: List of tools that MUST be called at least once |
| 17 | + # - minToolCalls: Minimum TOTAL number of tool calls across ALL tools (not per-tool) |
| 18 | + # - maxToolCalls: Maximum TOTAL number of tool calls across ALL tools (prevents runaway tool usage) |
| 19 | + # Example: If maxToolCalls=3, the agent can make up to 3 tool calls total in the test, |
| 20 | + # regardless of which tools are called. |
| 21 | + |
| 22 | + # Test 1: List clusters |
| 23 | + - path: tasks/list-clusters.yaml |
| 24 | + assertions: |
| 25 | + toolsUsed: |
| 26 | + - server: stackrox-mcp |
| 27 | + toolPattern: "list_clusters" |
| 28 | + minToolCalls: 1 |
| 29 | + maxToolCalls: 1 |
| 30 | + |
| 31 | + # Test 2: CVE detected in workloads |
| 32 | + # Claude performs a comprehensive CVE check (orchestrator, deployments, nodes), so up to 3 tool calls are allowed |
| 33 | + - path: tasks/cve-detected-workloads.yaml |
| 34 | + assertions: |
| 35 | + toolsUsed: |
| 36 | + - server: stackrox-mcp |
| 37 | + toolPattern: "get_deployments_for_cve" |
| 38 | + argumentsMatch: |
| 39 | + cveName: "CVE-2021-31805" |
| 40 | + minToolCalls: 1 |
| 41 | + maxToolCalls: 3 |
| 42 | + |
| 43 | + # Test 3: CVE detected in clusters - basic |
| 44 | + - path: tasks/cve-detected-clusters.yaml |
| 45 | + assertions: |
| 46 | + toolsUsed: |
| 47 | + - server: stackrox-mcp |
| 48 | + toolPattern: "get_clusters_with_orchestrator_cve" |
| 49 | + argumentsMatch: |
| 50 | + cveName: "CVE-2016-1000031" |
| 51 | + minToolCalls: 1 |
| 52 | + maxToolCalls: 3 |
| 53 | + |
| 54 | + # Test 4: Non-existent CVE |
| 55 | + # Allows up to 3 calls because "Is CVE detected in my clusters?" triggers a comprehensive check |
| 56 | + # (orchestrator, deployments, nodes). The LLM cannot know beforehand whether the CVE exists. |
| 57 | + - path: tasks/cve-nonexistent.yaml |
| 58 | + assertions: |
| 59 | + toolsUsed: |
| 60 | + - server: stackrox-mcp |
| 61 | + toolPattern: "get_clusters_with_orchestrator_cve" |
| 62 | + argumentsMatch: |
| 63 | + cveName: "CVE-2099-00001" |
| 64 | + minToolCalls: 1 |
| 65 | + maxToolCalls: 3 |
| 66 | + |
| 67 | + # Test 5: CVE with specific cluster filter (cluster exists) |
| 68 | + # Claude does comprehensive checking even for a single cluster (orchestrator, deployments, nodes) |
| 69 | + - path: tasks/cve-cluster-does-exist.yaml |
| 70 | + assertions: |
| 71 | + toolsUsed: |
| 72 | + - server: stackrox-mcp |
| 73 | + toolPattern: "list_clusters" |
| 74 | + - server: stackrox-mcp |
| 75 | + toolPattern: "get_clusters_with_orchestrator_cve" |
| 76 | + argumentsMatch: |
| 77 | + cveName: "CVE-2016-1000031" |
| 78 | + minToolCalls: 2 |
| 79 | + maxToolCalls: 4 |
| 80 | + |
| 81 | + # Test 6: CVE with specific cluster filter (cluster does not exist) |
| 82 | + - path: tasks/cve-cluster-does-not-exist.yaml |
| 83 | + assertions: |
| 84 | + toolsUsed: |
| 85 | + - server: stackrox-mcp |
| 86 | + toolPattern: "list_clusters" |
| 87 | + minToolCalls: 1 |
| 88 | + maxToolCalls: 2 |
| 89 | + |
| 90 | + # Test 7: CVE detected in clusters - general |
| 91 | + - path: tasks/cve-clusters-general.yaml |
| 92 | + assertions: |
| 93 | + toolsUsed: |
| 94 | + - server: stackrox-mcp |
| 95 | + toolPattern: "get_clusters_with_orchestrator_cve" |
| 96 | + argumentsMatch: |
| 97 | + cveName: "CVE-2021-31805" |
| 98 | + minToolCalls: 1 |
| 99 | + maxToolCalls: 5 |
| 100 | + |
| 101 | + # Test 8: CVE check with cluster list reference |
| 102 | + - path: tasks/cve-cluster-list.yaml |
| 103 | + assertions: |
| 104 | + toolsUsed: |
| 105 | + - server: stackrox-mcp |
| 106 | + toolPattern: "get_clusters_with_orchestrator_cve" |
| 107 | + argumentsMatch: |
| 108 | + cveName: "CVE-2024-52577" |
| 109 | + minToolCalls: 1 |
| 110 | + maxToolCalls: 5 |
0 commit comments