# chore(CI): Debug artifact download (#4)
---
# Weekly model-evaluation workflow: builds a model matrix, runs E2E tests per
# model, uploads per-model result artifacts, then aggregates them into docs.
name: Weekly Model Evaluation

on:
  schedule:
    - cron: '0 6 * * 1'  # Every Monday at 6:00 UTC
  pull_request:  # TODO: Remove before merging — only for debugging artifact paths
    paths:
      - '.github/workflows/model-evaluation.yml'
  workflow_dispatch:
    inputs:
      models:
        description: 'Comma-separated list of model IDs to evaluate'
        required: false
        default: 'gpt-5-mini'

# At most one evaluation run at a time; a newer run cancels an in-flight one.
concurrency:
  group: model-evaluation
  cancel-in-progress: true

jobs:
  prepare:
    name: Prepare Model Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      date: ${{ steps.set-date.outputs.date }}
    steps:
      - name: Set date
        id: set-date
        run: echo "date=$(date +%Y-%m-%d)" >> "$GITHUB_OUTPUT"
      - name: Set matrix
        id: set-matrix
        # Pass the user-controlled input through an env var instead of
        # interpolating ${{ }} into the script body, so a crafted input
        # cannot inject shell commands (GitHub script-injection hardening).
        env:
          MODELS_INPUT: ${{ inputs.models || 'gpt-5-mini' }}
        run: |
          MATRIX=$(echo "$MODELS_INPUT" | tr ',' '\n' | jq -R . | jq -sc '{"model": .}')
          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"

  evaluate:
    name: Evaluate ${{ matrix.model }}
    needs: prepare
    runs-on: ubuntu-latest
    strategy:
      matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}
      fail-fast: false  # one model failing must not cancel the others
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
      - name: Setup proto files
        run: make proto-setup
      - name: Generate proto descriptors
        run: make proto-generate
      - name: Download WireMock
        run: make mock-download
      - name: Run E2E tests
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          MODEL_KEY: ${{ secrets.OPENAI_API_KEY }}
          MODEL_NAME: "openai:${{ matrix.model }}"
        run: make e2e-test
      - name: Upload results
        if: always()  # keep results from failed runs for debugging
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ matrix.model }}
          path: e2e-tests/mcpchecker/mcpchecker-stackrox-mcp-e2e-out.json
          if-no-files-found: error

  update-docs:
    name: Update Documentation & Create PR
    needs: [prepare, evaluate]
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Download all results
        # Must be the v4 major version to match actions/upload-artifact@v4
        # (artifacts uploaded with v4 are only downloadable by v4+ of the
        # download action; "@v8" does not exist).
        uses: actions/download-artifact@v4
        with:
          # Download all artifacts instead of using pattern matching.
          # When pattern matches a single artifact, download-artifact extracts
          # it directly into the path without creating a subdirectory, breaking
          # the expected path structure. Downloading all artifacts always creates
          # per-artifact subdirectories.
          path: eval-results
      - name: Debug - list artifact structure
        run: |
          echo "=== eval-results directory structure ==="
          ls -R eval-results || echo "eval-results directory not found"
      - name: Update model evaluation docs
        # Same injection hardening as in prepare: the input only ever reaches
        # the shell through an environment variable.
        env:
          MODELS_INPUT: ${{ inputs.models || 'gpt-5-mini' }}
        run: |
          for MODEL in $(echo "$MODELS_INPUT" | tr ',' ' '); do
            RESULTS_FILE="eval-results/eval-results-${MODEL}/mcpchecker-stackrox-mcp-e2e-out.json"
            if [ -f "$RESULTS_FILE" ]; then
              echo "Updating docs for model: ${MODEL}"
              ./scripts/update-model-evaluation.sh \
                --model-id "${MODEL}" \
                --results "${RESULTS_FILE}"
            else
              echo "::warning::No results found for model ${MODEL}"
            fi
          done
      - name: Clean up eval results
        run: rm -rf eval-results
      - name: Check for changes
        id: check-changes
        run: |
          if git diff --quiet docs/model-evaluation.md; then
            echo "changed=false" >> "$GITHUB_OUTPUT"
          else
            echo "changed=true" >> "$GITHUB_OUTPUT"
          fi
      # TODO: Re-enable after debugging artifact paths
      # - name: Create Pull Request
      #   if: steps.check-changes.outputs.changed == 'true'
      #   uses: peter-evans/create-pull-request@v7
      #   with:
      #     branch: chore/update-model-evaluation-${{ needs.prepare.outputs.date }}
      #     commit-message: "Update model evaluations ${{ needs.prepare.outputs.date }}"
      #     title: "chore(evals): Update model evaluations ${{ needs.prepare.outputs.date }}"
      #     body: |
      #       Automated weekly model evaluation update.
      #
      #       **Models evaluated:** ${{ inputs.models || 'gpt-5-mini' }}
      #       **Date:** ${{ needs.prepare.outputs.date }}
      #
      #       This PR was automatically generated by the [Model Evaluation workflow](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}).
      #     base: main