Skip to content

Commit 2fd5410

Browse files
authored
ROX-32888: Add weekly job for model evaluation doc update (#103)
1 parent 077457b commit 2fd5410

File tree

1 file changed

+131
-0
lines changed

1 file changed

+131
-0
lines changed
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
# Workflow header: display name, triggers, and concurrency settings.
name: Weekly Model Evaluation

on:
  schedule:
    # Every Monday at 6:00 UTC
    - cron: '0 6 * * 1'
  # Manual trigger; the caller may override which models are evaluated.
  workflow_dispatch:
    inputs:
      models:
        description: 'Comma-separated list of model IDs to evaluate'
        required: false
        default: 'gpt-5-mini'

# Every run joins the same group, so starting a new run cancels one that is
# still in progress.
concurrency:
  group: model-evaluation
  cancel-in-progress: true
16+
17+
jobs:
# Job entry (belongs under `jobs:`). Publishes two outputs for later jobs:
#   matrix - JSON object {"model": [...]} consumed via fromJson() as a matrix
#   date   - the run date formatted as YYYY-MM-DD
prepare:
  name: Prepare Model Matrix
  runs-on: ubuntu-latest
  outputs:
    matrix: ${{ steps.set-matrix.outputs.matrix }}
    date: ${{ steps.set-date.outputs.date }}
  steps:
    - name: Set date
      id: set-date
      # -u pins the date to UTC regardless of the runner's local time zone.
      run: echo "date=$(date -u +%Y-%m-%d)" >> "$GITHUB_OUTPUT"

    - name: Set matrix
      id: set-matrix
      env:
        # Passed through the environment instead of being interpolated into
        # the script text, so the input can never be parsed as shell code.
        # Scheduled runs have no inputs, hence the fallback.
        MODELS: ${{ inputs.models || 'gpt-5-mini' }}
      run: |
        set -euo pipefail
        # Strip whitespace and drop empty entries so "a, b," yields ["a","b"].
        MATRIX=$(jq -nc --arg models "$MODELS" \
          '{model: ($models | gsub("\\s+"; "") | split(",") | map(select(length > 0)))}')
        # An empty matrix would make the dependent matrix job fail obscurely;
        # fail here with a clear message instead.
        if [ "$(jq '.model | length' <<< "$MATRIX")" -eq 0 ]; then
          echo "::error::No model IDs were supplied"
          exit 1
        fi
        echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"
35+
# Job entry (belongs under `jobs:`). Runs once per model listed in the matrix
# produced by the `prepare` job and uploads that model's result file.
evaluate:
  name: Evaluate ${{ matrix.model }}
  needs: prepare
  runs-on: ubuntu-latest
  strategy:
    # Let the remaining models finish even if one of them fails.
    fail-fast: false
    matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up Go
      uses: actions/setup-go@v5
      with:
        go-version-file: go.mod

    - name: Setup proto files
      run: make proto-setup

    - name: Generate proto descriptors
      run: make proto-generate

    - name: Download WireMock
      run: make mock-download

    - name: Run E2E tests
      env:
        # The same secret is exported under both variable names.
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        MODEL_KEY: ${{ secrets.OPENAI_API_KEY }}
        MODEL_NAME: 'openai:${{ matrix.model }}'
      run: make e2e-test

    - name: Upload results
      # Runs even when an earlier step failed.
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: eval-results-${{ matrix.model }}
        path: e2e-tests/mcpchecker/mcpchecker-stackrox-mcp-e2e-out.json
        if-no-files-found: error
75+
# Job entry (belongs under `jobs:`). Downloads every eval-results-* artifact,
# runs the docs update script for each requested model, and opens a pull
# request when docs/model-evaluation.md has changed.
# NOTE(review): with the default job condition this job is skipped when any
# `evaluate` leg fails, even though results are uploaded with `if: always()`
# - confirm that skipping is intended.
update-docs:
  name: Update Documentation & Create PR
  needs: [prepare, evaluate]
  runs-on: ubuntu-latest
  permissions:
    contents: write
    pull-requests: write
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Download all results
      uses: actions/download-artifact@v8
      with:
        pattern: eval-results-*
        path: eval-results

    - name: Update model evaluation docs
      env:
        # Passed through the environment instead of being interpolated into
        # the script text, so the input can never be parsed as shell code.
        MODELS: ${{ inputs.models || 'gpt-5-mini' }}
      run: |
        # Split on commas into an array; avoids glob expansion of the input.
        IFS=',' read -ra MODEL_IDS <<< "$MODELS"
        for RAW_MODEL in "${MODEL_IDS[@]}"; do
          # Drop surrounding whitespace and skip empty entries.
          MODEL="$(printf '%s' "$RAW_MODEL" | tr -d '[:space:]')"
          if [ -z "$MODEL" ]; then
            continue
          fi
          RESULTS_FILE="eval-results/eval-results-${MODEL}/mcpchecker-stackrox-mcp-e2e-out.json"
          if [ -f "$RESULTS_FILE" ]; then
            echo "Updating docs for model: ${MODEL}"
            ./scripts/update-model-evaluation.sh \
              --model-id "${MODEL}" \
              --results "${RESULTS_FILE}"
          else
            echo "::warning:: No results found for model ${MODEL}"
          fi
        done

    - name: Check for changes
      id: check-changes
      run: |
        # `git status --porcelain` reports modified and untracked files, so a
        # docs file created for the first time is detected as a change too.
        if [ -z "$(git status --porcelain -- docs/model-evaluation.md)" ]; then
          echo "changed=false" >> "$GITHUB_OUTPUT"
        else
          echo "changed=true" >> "$GITHUB_OUTPUT"
        fi

    - name: Create Pull Request
      if: steps.check-changes.outputs.changed == 'true'
      uses: peter-evans/create-pull-request@v7
      with:
        branch: chore/update-model-evaluation-${{ needs.prepare.outputs.date }}
        commit-message: "Update model evaluations ${{ needs.prepare.outputs.date }}"
        title: "chore(evals): Update model evaluations ${{ needs.prepare.outputs.date }}"
        body: |
          Automated weekly model evaluation update.

          **Models evaluated:** ${{ inputs.models || 'gpt-5-mini' }}
          **Date:** ${{ needs.prepare.outputs.date }}

          This PR was automatically generated by the [Model Evaluation workflow](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}).
        base: main

0 commit comments

Comments
 (0)