# Weekly Model Evaluation #1
# NOTE: This file was flagged as containing hidden or bidirectional Unicode text that may be interpreted or compiled differently than it appears.
# To review it, open the file in an editor that reveals hidden Unicode characters.
# Workflow that evaluates one or more models on a weekly schedule, or on demand.
name: Weekly Model Evaluation

on:
  # Scheduled trigger.
  schedule:
    - cron: "0 6 * * 1"  # Every Monday at 6:00 UTC
  # Manual trigger; the optional `models` input overrides the default model.
  workflow_dispatch:
    inputs:
      models:
        description: "Comma-separated list of model IDs to evaluate"
        required: false
        default: "gpt-5-mini"

# All runs share a single concurrency group; starting a new run cancels the
# one that is still in progress.
concurrency:
  group: model-evaluation
  cancel-in-progress: true
jobs:
  # Builds the model matrix consumed by `evaluate` and `update-docs`, and
  # captures the run date used in the PR branch name, title and body.
  prepare:
    name: Prepare Model Matrix
    runs-on: ubuntu-latest
    # This job only writes step outputs; it needs no token permissions.
    permissions: {}
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      date: ${{ steps.set-date.outputs.date }}
    steps:
      - name: Set date
        id: set-date
        # -u makes the UTC date explicit instead of relying on the runner TZ.
        run: echo "date=$(date -u +%Y-%m-%d)" >> "$GITHUB_OUTPUT"

      - name: Set matrix
        id: set-matrix
        env:
          # Passed through the environment rather than interpolated into the
          # script, so the user-supplied input can never be executed as shell.
          # `inputs.models` is empty on scheduled runs, hence the fallback.
          MODELS: ${{ inputs.models || 'gpt-5-mini' }}
        run: |
          # Split on commas, trim surrounding whitespace and drop empty
          # entries, so "a, b," yields exactly ["a", "b"].
          MATRIX=$(printf '%s' "$MODELS" | jq -Rsc '
            split(",")
            | map(gsub("^\\s+|\\s+$"; ""))
            | map(select(length > 0))
            | {model: .}
          ')
          # An empty matrix would make the `evaluate` job definition invalid;
          # fail here with a clear message instead.
          if [ "$(jq '.model | length' <<<"$MATRIX")" -eq 0 ]; then
            echo "::error::No model IDs were provided"
            exit 1
          fi
          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"

  # Runs the E2E suite once per model and uploads each result file as an
  # artifact named eval-results-<model>.
  evaluate:
    name: Evaluate ${{ matrix.model }}
    needs: prepare
    runs-on: ubuntu-latest
    # Only needs to read the repository; secrets are passed explicitly below.
    permissions:
      contents: read
    strategy:
      matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}
      # Let the remaining models finish even if one of them fails.
      fail-fast: false
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version-file: go.mod

      - name: Setup proto files
        run: make proto-setup

      - name: Generate proto descriptors
        run: make proto-generate

      - name: Download WireMock
        run: make mock-download

      - name: Run E2E tests
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          MODEL_KEY: ${{ secrets.OPENAI_API_KEY }}
          MODEL_NAME: "openai:${{ matrix.model }}"
        run: make e2e-test

      - name: Upload results
        # Upload even when the tests fail so the results are still available.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ matrix.model }}
          path: e2e-tests/mcpchecker/mcpchecker-stackrox-mcp-e2e-out.json
          if-no-files-found: error

  # Folds every downloaded result into docs/model-evaluation.md and opens a
  # pull request when that file changed.
  # NOTE(review): with a plain `needs`, this job is skipped as soon as any
  # `evaluate` matrix leg fails, so the "No results found" branch below is
  # rarely reachable. Confirm whether partial results should still be
  # published (e.g. via an `if: ${{ !cancelled() }}` condition).
  update-docs:
    name: Update Documentation & Create PR
    needs: [prepare, evaluate]
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Download all results
        uses: actions/download-artifact@v8
        with:
          pattern: eval-results-*
          path: eval-results

      - name: Update model evaluation docs
        env:
          # Reuse the exact model list that named the artifacts, so the
          # lookup below cannot drift from what `evaluate` uploaded.
          MATRIX: ${{ needs.prepare.outputs.matrix }}
        run: |
          # Read the list up front so the update script cannot consume the
          # loop's input through stdin.
          mapfile -t MODEL_IDS < <(jq -r '.model[]' <<<"$MATRIX")
          for MODEL in "${MODEL_IDS[@]}"; do
            RESULTS_FILE="eval-results/eval-results-${MODEL}/mcpchecker-stackrox-mcp-e2e-out.json"
            if [ -f "$RESULTS_FILE" ]; then
              echo "Updating docs for model: ${MODEL}"
              ./scripts/update-model-evaluation.sh \
                --model-id "${MODEL}" \
                --results "${RESULTS_FILE}"
            else
              echo "::warning:: No results found for model ${MODEL}"
            fi
          done

      - name: Check for changes
        id: check-changes
        run: |
          # `git status --porcelain` also reports the file when it is newly
          # created (untracked), which `git diff --quiet` would miss.
          if [ -n "$(git status --porcelain -- docs/model-evaluation.md)" ]; then
            echo "changed=true" >> "$GITHUB_OUTPUT"
          else
            echo "changed=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Create Pull Request
        if: steps.check-changes.outputs.changed == 'true'
        uses: peter-evans/create-pull-request@v7
        with:
          branch: chore/update-model-evaluation-${{ needs.prepare.outputs.date }}
          commit-message: "Update model evaluations ${{ needs.prepare.outputs.date }}"
          title: "chore(evals): Update model evaluations ${{ needs.prepare.outputs.date }}"
          # Commit only the generated document; without this the action adds
          # every new or modified file, including the downloaded artifacts
          # under eval-results/.
          add-paths: docs/model-evaluation.md
          body: |
            Automated weekly model evaluation update.
            **Models evaluated:** ${{ join(fromJson(needs.prepare.outputs.matrix).model, ', ') }}
            **Date:** ${{ needs.prepare.outputs.date }}
            This PR was automatically generated by the [Model Evaluation workflow](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}).
          base: main