Skip to content

Weekly Model Evaluation #1

Weekly Model Evaluation

Weekly Model Evaluation #1

# Weekly model evaluation: runs the E2E suite against one or more models and
# opens a pull request with the refreshed evaluation document.
name: Weekly Model Evaluation

on:
  schedule:
    - cron: '0 6 * * 1'  # Every Monday at 6:00 UTC
  # Manual runs may override the model list. Scheduled runs carry no inputs, so
  # the jobs fall back to the same default via `inputs.models || '...'`.
  workflow_dispatch:
    inputs:
      models:
        description: 'Comma-separated list of model IDs to evaluate'
        required: false
        default: 'gpt-5-mini'

# All runs share one concurrency group; a newly triggered run cancels the run
# that is still in progress.
concurrency:
  group: model-evaluation
  cancel-in-progress: true

# Least-privilege default for GITHUB_TOKEN. Jobs that need more than read
# access declare their own `permissions` block, which overrides this one.
permissions:
  contents: read
jobs:
  # Builds the JSON matrix of models to evaluate and records the run date that
  # the PR branch name, commit message and title are derived from.
  prepare:
    name: Prepare Model Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      date: ${{ steps.set-date.outputs.date }}
    steps:
      - name: Set date
        id: set-date
        run: echo "date=$(date +%Y-%m-%d)" >> "$GITHUB_OUTPUT"

      - name: Set matrix
        id: set-matrix
        env:
          # Handed over via the environment instead of being interpolated into
          # the script text, so the input can never be executed as shell code.
          MODELS: ${{ inputs.models || 'gpt-5-mini' }}
        run: |
          # One model per line, surrounding whitespace trimmed, blank entries
          # dropped, so "a, b" and a trailing comma produce clean model IDs.
          MATRIX=$(printf '%s\n' "$MODELS" \
            | tr ',' '\n' \
            | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' -e '/^$/d' \
            | jq -R . \
            | jq -sc '{"model": .}')
          # An empty matrix would make the evaluate job fail with an opaque
          # error; stop here with a clear message instead.
          if [ "$MATRIX" = '{"model":[]}' ]; then
            echo "::error::No model IDs found in the 'models' input"
            exit 1
          fi
          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"

  # Runs the E2E suite once per model in the matrix and uploads the result
  # file as an artifact named eval-results-<model>.
  evaluate:
    name: Evaluate ${{ matrix.model }}
    needs: prepare
    runs-on: ubuntu-latest
    strategy:
      matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}
      # Let the remaining models finish even if one of them fails.
      fail-fast: false
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version-file: go.mod

      - name: Setup proto files
        run: make proto-setup

      - name: Generate proto descriptors
        run: make proto-generate

      - name: Download WireMock
        run: make mock-download

      - name: Run E2E tests
        env:
          # The same secret is exported under both names.
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          MODEL_KEY: ${{ secrets.OPENAI_API_KEY }}
          MODEL_NAME: "openai:${{ matrix.model }}"
        run: make e2e-test

      - name: Upload results
        # Upload even when the test step failed.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ matrix.model }}
          path: e2e-tests/mcpchecker/mcpchecker-stackrox-mcp-e2e-out.json
          if-no-files-found: error

  # Feeds every downloaded result file to the docs update script and opens a
  # PR when docs/model-evaluation.md changed.
  # NOTE(review): with a plain `needs`, this job is skipped as soon as any
  # evaluate leg fails, so the "No results found" warning below is rarely
  # reachable. Confirm whether partial results should still be published.
  update-docs:
    name: Update Documentation & Create PR
    needs: [prepare, evaluate]
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Download all results
        uses: actions/download-artifact@v8
        with:
          pattern: eval-results-*
          path: eval-results

      - name: Update model evaluation docs
        env:
          # Handed over via the environment instead of being interpolated into
          # the script text, so the input can never be executed as shell code.
          MODELS: ${{ inputs.models || 'gpt-5-mini' }}
        run: |
          # Word splitting after the comma-to-space swap also discards stray
          # whitespace and empty entries in the list.
          for MODEL in $(echo "$MODELS" | tr ',' ' '); do
            RESULTS_FILE="eval-results/eval-results-${MODEL}/mcpchecker-stackrox-mcp-e2e-out.json"
            if [ -f "$RESULTS_FILE" ]; then
              echo "Updating docs for model: ${MODEL}"
              ./scripts/update-model-evaluation.sh \
                --model-id "${MODEL}" \
                --results "${RESULTS_FILE}"
            else
              echo "::warning:: No results found for model ${MODEL}"
            fi
          done

      - name: Check for changes
        id: check-changes
        run: |
          # `git diff --quiet` exits 0 when the file is unchanged.
          if git diff --quiet docs/model-evaluation.md; then
            echo "changed=false" >> "$GITHUB_OUTPUT"
          else
            echo "changed=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Create Pull Request
        if: steps.check-changes.outputs.changed == 'true'
        uses: peter-evans/create-pull-request@v7
        with:
          branch: chore/update-model-evaluation-${{ needs.prepare.outputs.date }}
          commit-message: "Update model evaluations ${{ needs.prepare.outputs.date }}"
          title: "chore(evals): Update model evaluations ${{ needs.prepare.outputs.date }}"
          # Commit only the evaluation document. Without `add-paths` the action
          # adds every new or modified file, which would pull the downloaded
          # eval-results/ directory into the PR unless it is git-ignored.
          add-paths: docs/model-evaluation.md
          body: |
            Automated weekly model evaluation update.
            **Models evaluated:** ${{ inputs.models || 'gpt-5-mini' }}
            **Date:** ${{ needs.prepare.outputs.date }}
            This PR was automatically generated by the [Model Evaluation workflow](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}).
          base: main