|
# Evaluates one or more models with the E2E test suite and opens a PR that
# refreshes the model evaluation docs. Runs weekly and on manual dispatch.
name: Weekly Model Evaluation

on:
  schedule:
    - cron: '0 6 * * 1'  # Every Monday at 6:00 UTC
  workflow_dispatch:
    inputs:
      models:
        description: 'Comma-separated list of model IDs to evaluate'
        required: false
        default: 'gpt-5-mini'

# Least-privilege default for GITHUB_TOKEN in every job. A job that needs
# more must declare its own `permissions` block, which replaces this one.
permissions:
  contents: read

# All runs share one concurrency group; starting a new run cancels a run of
# this workflow that is still in progress.
concurrency:
  group: model-evaluation
  cancel-in-progress: true
| 16 | + |
jobs:
  # Produces the outputs shared by the other jobs: the model matrix and a
  # date stamp used in the PR branch name, commit message and title.
  prepare:
    name: Prepare Model Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      date: ${{ steps.set-date.outputs.date }}
    steps:
      - name: Set date
        id: set-date
        # -u pins the stamp to UTC, the same timezone the cron schedule uses.
        run: echo "date=$(date -u +%Y-%m-%d)" >> "$GITHUB_OUTPUT"

      - name: Set matrix
        id: set-matrix
        env:
          # The input reaches the script through the environment instead of
          # being expanded into the script text, so its content is never
          # parsed as shell. Scheduled runs carry no inputs, hence the
          # fallback value.
          MODELS: ${{ inputs.models || 'gpt-5-mini' }}
        run: |
          # Split on commas, trim surrounding whitespace and drop empty
          # entries so "a, b," yields exactly ["a", "b"].
          MATRIX=$(jq -nc --arg models "$MODELS" '
            $models
            | split(",")
            | map(gsub("^\\s+|\\s+$"; ""))
            | map(select(length > 0))
            | {model: .}
          ')
          if [ "$(jq '.model | length' <<<"$MATRIX")" -eq 0 ]; then
            echo "::error::No model IDs were supplied"
            exit 1
          fi
          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"

  # Runs the E2E suite once per model in the matrix and uploads the result
  # file as an artifact named after the model.
  evaluate:
    name: Evaluate ${{ matrix.model }}
    needs: prepare
    runs-on: ubuntu-latest
    strategy:
      matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}
      # Let the remaining models finish even if one of them fails.
      fail-fast: false
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version-file: go.mod

      - name: Setup proto files
        run: make proto-setup

      - name: Generate proto descriptors
        run: make proto-generate

      - name: Download WireMock
        run: make mock-download

      - name: Run E2E tests
        env:
          # Both variables are populated from the same secret.
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          MODEL_KEY: ${{ secrets.OPENAI_API_KEY }}
          MODEL_NAME: "openai:${{ matrix.model }}"
        run: make e2e-test

      - name: Upload results
        # Upload even when the test step failed; a missing result file is
        # still reported as an error.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ matrix.model }}
          path: e2e-tests/mcpchecker/mcpchecker-stackrox-mcp-e2e-out.json
          if-no-files-found: error

  # Feeds every downloaded result file to the docs update script and opens a
  # pull request when docs/model-evaluation.md changed.
  update-docs:
    name: Update Documentation & Create PR
    needs: [prepare, evaluate]
    # Without an explicit condition this job is skipped as soon as any
    # matrix job in `evaluate` fails. Run unless the workflow was cancelled
    # so the results that were uploaded are still processed; models without
    # results are reported by the warning in the update step.
    if: ${{ !cancelled() && needs.prepare.result == 'success' }}
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Download all results
        uses: actions/download-artifact@v8
        with:
          pattern: eval-results-*
          path: eval-results

      - name: Update model evaluation docs
        env:
          # Iterate over the exact model list the evaluate job used, so the
          # artifact directories looked up here match the uploaded names.
          # Passing it via the environment keeps it out of the script text.
          MATRIX: ${{ needs.prepare.outputs.matrix }}
        run: |
          MODEL_LIST=$(jq -r '.model[]' <<<"$MATRIX")
          mapfile -t MODEL_ARRAY <<<"$MODEL_LIST"
          for MODEL in "${MODEL_ARRAY[@]}"; do
            # An empty list still yields one blank element; skip it.
            [ -n "$MODEL" ] || continue
            RESULTS_FILE="eval-results/eval-results-${MODEL}/mcpchecker-stackrox-mcp-e2e-out.json"
            if [ -f "$RESULTS_FILE" ]; then
              echo "Updating docs for model: ${MODEL}"
              ./scripts/update-model-evaluation.sh \
                --model-id "${MODEL}" \
                --results "${RESULTS_FILE}"
            else
              echo "::warning:: No results found for model ${MODEL}"
            fi
          done

      - name: Check for changes
        id: check-changes
        run: |
          # `git diff --quiet` exits non-zero when the file differs from the
          # checked-out version.
          if git diff --quiet docs/model-evaluation.md; then
            echo "changed=false" >> "$GITHUB_OUTPUT"
          else
            echo "changed=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Create Pull Request
        if: steps.check-changes.outputs.changed == 'true'
        uses: peter-evans/create-pull-request@v7
        with:
          branch: chore/update-model-evaluation-${{ needs.prepare.outputs.date }}
          commit-message: "Update model evaluations ${{ needs.prepare.outputs.date }}"
          title: "chore(evals): Update model evaluations ${{ needs.prepare.outputs.date }}"
          body: |
            Automated weekly model evaluation update.

            **Models evaluated:** ${{ inputs.models || 'gpt-5-mini' }}
            **Date:** ${{ needs.prepare.outputs.date }}

            This PR was automatically generated by the [Model Evaluation workflow](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}).
          base: main
0 commit comments