ROX-32888: Add weekly job for model evaluation doc update (#103)

mtodor · web-flow · commit 2fd5410c9bb8 · 2026-04-02T15:34:17.000+02:00
diff --git a/.github/workflows/model-evaluation.yml b/.github/workflows/model-evaluation.yml
@@ -0,0 +1,131 @@
+name: Weekly Model Evaluation
+
+on:
+  workflow_dispatch:  # manual run; the model list may be overridden
+    inputs:
+      models:
+        default: 'gpt-5-mini'
+        required: false
+        description: 'Comma-separated list of model IDs to evaluate'
+  schedule:
+    - cron: '0 6 * * 1'  # Mondays at 06:00 UTC
+
+concurrency:  # a newly triggered run cancels one that is still in progress
+  cancel-in-progress: true
+  group: model-evaluation
+
+jobs:
+  prepare:  # builds the model matrix and the date stamp consumed by later jobs
+    name: Prepare Model Matrix
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}  # JSON object: {"model": [...]}
+      date: ${{ steps.set-date.outputs.date }}  # YYYY-MM-DD, runner clock
+    steps:
+      - name: Set date
+        id: set-date
+        run: echo "date=$(date +%Y-%m-%d)" >> "$GITHUB_OUTPUT"
+
+      - name: Set matrix  # strips whitespace and drops empty entries from the list
+        id: set-matrix
+        env: { MODELS: "${{ inputs.models || 'gpt-5-mini' }}" }  # via env, never inlined into the script
+        run: |
+          MATRIX=$(tr -d '[:space:]' <<< "$MODELS" | jq -Rsc 'split(",") | map(select(length > 0)) | {model: .}')
+          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"
+
+  evaluate:  # one job per model listed in the matrix output of `prepare`
+    needs: prepare
+    name: Evaluate ${{ matrix.model }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false  # a failing model does not cancel the other matrix jobs
+      matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}
+    steps:
+      - uses: actions/checkout@v4
+        name: Checkout code
+
+      - uses: actions/setup-go@v5
+        name: Set up Go
+        with:
+          go-version-file: go.mod  # Go version is taken from go.mod
+
+      - run: make proto-setup
+        name: Setup proto files
+
+      - run: make proto-generate
+        name: Generate proto descriptors
+
+      - run: make mock-download
+        name: Download WireMock
+
+      - name: Run E2E tests
+        run: make e2e-test
+        env:  # the same secret is exported under both variable names
+          MODEL_NAME: "openai:${{ matrix.model }}"
+          MODEL_KEY: ${{ secrets.OPENAI_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+
+      - uses: actions/upload-artifact@v4
+        name: Upload results
+        if: always()  # attempt the upload even when an earlier step failed
+        with:
+          if-no-files-found: error  # a missing results file fails this step
+          name: eval-results-${{ matrix.model }}
+          path: e2e-tests/mcpchecker/mcpchecker-stackrox-mcp-e2e-out.json
+
+  update-docs:  # regenerates docs/model-evaluation.md from the results and opens a PR
+    name: Update Documentation & Create PR
+    needs: [prepare, evaluate]  # NOTE(review): skipped when any evaluate job fails - confirm intended
+    runs-on: ubuntu-latest
+    permissions:  # write access is needed to push the branch and open the PR
+      contents: write
+      pull-requests: write
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Download all results
+        uses: actions/download-artifact@v8
+        with:
+          pattern: eval-results-*
+          path: eval-results  # the script below reads eval-results/<artifact-name>/<file>
+
+      - name: Update model evaluation docs
+        env: { MODELS: "${{ inputs.models || 'gpt-5-mini' }}" }  # via env, never inlined into the script
+        run: |
+          for MODEL in $(echo "$MODELS" | tr ',' ' '); do
+            RESULTS_FILE="eval-results/eval-results-${MODEL}/mcpchecker-stackrox-mcp-e2e-out.json"
+            if [ -f "$RESULTS_FILE" ]; then
+              echo "Updating docs for model: ${MODEL}"
+              ./scripts/update-model-evaluation.sh \
+                --model-id "${MODEL}" \
+                --results "${RESULTS_FILE}"
+            else
+              echo "::warning:: No results found for model ${MODEL}"
+            fi
+          done
+
+      - name: Check for changes  # sets `changed` to true only when the doc file differs
+        id: check-changes
+        run: |
+          if git diff --quiet docs/model-evaluation.md; then
+            echo "changed=false" >> "$GITHUB_OUTPUT"
+          else
+            echo "changed=true" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Create Pull Request
+        if: steps.check-changes.outputs.changed == 'true'
+        uses: peter-evans/create-pull-request@v7
+        with:
+          branch: chore/update-model-evaluation-${{ needs.prepare.outputs.date }}
+          commit-message: "Update model evaluations ${{ needs.prepare.outputs.date }}"
+          title: "chore(evals): Update model evaluations ${{ needs.prepare.outputs.date }}"
+          body: |
+            Automated weekly model evaluation update.
+
+            **Models evaluated:** ${{ inputs.models || 'gpt-5-mini' }}
+            **Date:** ${{ needs.prepare.outputs.date }}
+
+            This PR was automatically generated by the [Model Evaluation workflow](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}).
+          base: main