Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Version-control metadata
.git
.gitignore

# Virtual environments and uv cache directories, at the root and in any subdirectory
.venv
**/.venv
.uv-cache
**/.uv-cache

# Python bytecode
__pycache__
**/__pycache__
*.pyc

# Packaging output
dist
build
*.egg-info

# Presumably run-time logs and benchmark results -- confirm against the run scripts
logs
outputs
44 changes: 44 additions & 0 deletions .github/workflows/sdk-package.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Builds the system-intelligence-sdk wheel and sdist, checks their metadata
# with twine, and uploads the contents of dist/ as a workflow artifact.
name: SDK Package

# Runs for SDK, root pyproject, or workflow changes (pushes to main and all
# pull requests), and can also be started manually.
on:
  push:
    branches: [main]
    paths:
      - 'sdk/**'
      - 'pyproject.toml'
      - '.github/workflows/sdk-package.yml'
  pull_request:
    paths:
      - 'sdk/**'
      - 'pyproject.toml'
      - '.github/workflows/sdk-package.yml'
  workflow_dispatch:

# The job only reads the repository, so keep GITHUB_TOKEN least-privileged.
permissions:
  contents: read

jobs:
  build-sdk:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is always read as a string, never a float.
          python-version: '3.11'

      - name: Install uv
        uses: astral-sh/setup-uv@v6

      - name: Build SDK package
        run: uv build --package system-intelligence-sdk --wheel --sdist

      - name: Verify package metadata
        run: uvx twine check dist/system_intelligence_sdk-*

      - name: Upload SDK dist artifacts
        uses: actions/upload-artifact@v4
        with:
          name: sdk-dist
          path: dist/*
          # Fail the job instead of only warning when nothing matches dist/*.
          if-no-files-found: error
          retention-days: 14
46 changes: 22 additions & 24 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,22 @@ jobs:
# If one benchmark fails, continue running the others
fail-fast: false
matrix:
benchmark:
- example_bench
- toposense_bench
- courselab_bench
- courseexam_bench
include:
- benchmark: example_bench
package: example-bench
- benchmark: toposense_bench
package: toposense-bench
- benchmark: courselab_bench
package: courselab-bench
- benchmark: courseexam_bench
package: courseexam-bench
# TODO: For now, we comment out other benchmarks as they have no tests
# - arteval_bench
# - cache_bench
# - course_project_bench
# - benchmark: arteval_bench
# package: arteval-bench
# - benchmark: cache_bench
# package: cache-bench
# - benchmark: course_project_bench
# package: course-project-bench

steps:
- name: Checkout code
Expand All @@ -33,25 +40,16 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.10'
python-version: '3.11'

- name: Install uv
uses: astral-sh/setup-uv@v6

- name: Install dependencies
working-directory: benchmarks/${{ matrix.benchmark }}
run: |
python -m venv env${{ matrix.benchmark }}
source env${{ matrix.benchmark }}/bin/activate
pip install --upgrade pip
pip install pytest
if [ -f requirements.txt ]; then
pip install -r requirements.txt
fi
if [ -f pyproject.toml ]; then
pip install -e ".[dev]"
fi
deactivate
uv sync --frozen --package "${{ matrix.package }}" --extra dev

- name: Run tests
run: |
source benchmarks/${{ matrix.benchmark }}/env${{ matrix.benchmark }}/bin/activate
pytest benchmarks/${{ matrix.benchmark }}/tests -v
deactivate
uv run --frozen --no-sync --package "${{ matrix.package }}" \
pytest benchmarks/${{ matrix.benchmark }}/tests -v
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ __pycache__/
*.pyc
.venv/
venv/
build/
dist/
*.egg-info/

# IDE
.vscode/
Expand Down
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ System Intelligence Benchmark currently includes the following example benchmark
- **SDK** (`sdk/`) - Software development kit providing evaluators, LLM interfaces, and utility functions
- **Documentation** (`doc/`) - Guides and documentation for using and contributing to System Intelligence Benchmark

For the canonical repository boundaries and migration direction, see [doc/project_structure.md](doc/project_structure.md).
For SDK packaging and release flow, see [doc/sdk_packaging.md](doc/sdk_packaging.md).

### Prerequisites

- Python 3.9+
Expand Down Expand Up @@ -145,4 +148,3 @@ trademarks or logos is subject to and must follow
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
Any use of third-party trademarks or logos are subject to those third-party's policies.

67 changes: 38 additions & 29 deletions benchmarks/arteval_bench/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,34 +1,43 @@
FROM ubuntu:24.04
FROM ghcr.io/astral-sh/uv:python3.11-bookworm-slim AS builder

ARG DEBIAN_FRONTEND=noninteractive

USER root
WORKDIR /workspace
COPY . /workspace
RUN mkdir -p /workspace/dist \
&& (uv build --package system-intelligence-sdk --wheel -o /workspace/dist || true) \
&& uv build --all-packages --wheel -o /workspace/dist

WORKDIR /
COPY . .
FROM ghcr.io/astral-sh/uv:python3.11-bookworm-slim

RUN rm -rf /var/lib/apt/lists/* \
&& apt-get update -o Acquire::Retries=5 \
&& apt-get install -y --no-install-recommends \
build-essential \
git \
wget \
python3-pip \
python3-venv \
pipx \
ARG DEBIAN_FRONTEND=noninteractive
USER root
RUN apt-get update && apt-get install -y --no-install-recommends git \
&& rm -rf /var/lib/apt/lists/*

# SWE-ReX will always attempt to install its server into your docker container
# however, this takes a couple of seconds. If we already provide it in the image,
# this is much faster.
RUN pipx install swe-rex
RUN pipx ensurepath

ENV PATH="/root/.local/bin:${PATH}"
ENV PATH="/usr/local/go/bin:${PATH}"

SHELL ["/bin/bash", "-c"]

RUN chmod +x install.sh test.sh && ./install.sh

CMD ["bash"]
# Build with repository root as context:
# docker build -f benchmarks/arteval_bench/Dockerfile .
WORKDIR /workspace
COPY . /workspace
COPY --from=builder /workspace/dist/*.whl /tmp/dist/

WORKDIR /workspace/benchmarks/arteval_bench
RUN set -eux; \
SDK_WHEEL="$(ls /tmp/dist/system_intelligence_sdk-*.whl | head -n1 || true)"; \
BENCH_WHEEL="$(ls /tmp/dist/arteval_bench-*.whl | head -n1 || true)"; \
if [ -z "$SDK_WHEEL" ]; then \
echo "Missing SDK wheel in /tmp/dist. Build with repo root context:"; \
echo "docker build -t arteval_bench -f benchmarks/arteval_bench/Dockerfile ."; \
ls -1 /tmp/dist || true; \
exit 1; \
fi; \
if [ -z "$BENCH_WHEEL" ]; then \
echo "Missing arteval_bench wheel in /tmp/dist."; \
ls -1 /tmp/dist || true; \
exit 1; \
fi; \
rm -rf .venv; \
uv venv .venv; \
uv pip install --python .venv/bin/python "$SDK_WHEEL" "$BENCH_WHEEL"; \
.venv/bin/python src/core/sweagent_compat.py >/dev/null; \
.venv/bin/sweagent --help >/dev/null

CMD ["bash"]
31 changes: 14 additions & 17 deletions benchmarks/arteval_bench/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,25 @@

set -e # Exit immediately on error.

if ! command -v uv >/dev/null 2>&1; then
echo "==> uv not found. Installing uv..."
curl -LsSf https://astral.sh/uv/install.sh | sh
export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
Comment on lines +6 to +8
Copy link

Copilot AI Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The install script auto-installs uv by piping a remote shell script from the network into sh. That pattern is a supply-chain risk and also makes installs non-reproducible in locked-down environments. Prefer documenting a manual uv installation step (or at least prompting for confirmation / verifying a pinned installer checksum) instead of executing a remote script automatically.

Suggested change
echo "==> uv not found. Installing uv..."
curl -LsSf https://astral.sh/uv/install.sh | sh
export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
echo "==> Error: 'uv' command not found."
echo "Please install 'uv' manually before running this script."
echo "For installation instructions, see: https://docs.astral.sh/uv/getting-started/installation/"
exit 1

Copilot uses AI. Check for mistakes.
fi

REPO_ROOT="$(git rev-parse --show-toplevel 2>/dev/null || pwd)"
export UV_CACHE_DIR="${UV_CACHE_DIR:-${REPO_ROOT}/.uv-cache}"

# if .venv does not exist, create it
if [ -d ".venv" ]; then
echo "==> .venv already exists, skipping creation."
else
echo "==> Creating .venv directory..."

python3 -m venv .venv
source .venv/bin/activate

if [ ! -d "SWE-agent" ]; then
echo "==> Install SWE-agent and its dependencies..."
git clone https://github.com/SWE-agent/SWE-agent.git
cd SWE-agent
git checkout 0c27f286303a939aa868ad2003bc4b6776771791
pip install --editable .
sweagent --help
cd ..
else
echo "==> SWE-agent repository already exists, skipping clone."
fi

deactivate
uv venv .venv
fi

uv sync --extra dev
uv run --no-sync python src/core/sweagent_compat.py >/dev/null
uv run --no-sync sweagent --help >/dev/null

echo "==> ArtEvalBench environment is set up successfully."
28 changes: 28 additions & 0 deletions benchmarks/arteval_bench/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
[project]
name = "arteval-bench"
version = "0.1.0"
description = "ArtEval benchmark package"
requires-python = ">=3.11"
dependencies = [
"system-intelligence-sdk>=0.1.0",
"requests",
"azure-identity",
"sweagent @ git+https://github.com/SWE-agent/SWE-agent.git@v1.1.0",
Copy link

Copilot AI Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The sweagent dependency is brought in via a Git URL pinned only to a tag (v1.1.0), which is mutable and can be retargeted to arbitrary commits. If the SWE-agent repository or its release tags are compromised, future installs/builds could transparently pull and execute attacker-controlled code in environments that hold API keys or other secrets. Prefer pinning this dependency to an immutable commit SHA (or a verified release artifact) so that the exact code version being executed cannot be changed without explicitly updating this configuration.

Copilot uses AI. Check for mistakes.
]

[project.optional-dependencies]
dev = [
"pytest>=8.0.0",
"ruff>=0.6.0",
]

[build-system]
requires = ["uv_build>=0.10.4,<0.11.0"]
build-backend = "uv_build"

[tool.uv.build-backend]
module-name = "src"
module-root = ""

[tool.uv.sources]
system-intelligence-sdk = { workspace = true }
16 changes: 9 additions & 7 deletions benchmarks/arteval_bench/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,22 @@ NEW_MODEL_NAME="${MODEL_NAME//\//_}"
# export OPENAI_BASE_URL="http://localhost:2327/v1"
# export OPENAI_API_KEY="EMPTY"

source .venv/bin/activate
if [ ! -x ".venv/bin/python" ]; then
echo "==> .venv is missing. Run ./install.sh first."
exit 1
fi

echo "==> Start to run ArtEvalBench"
# Note that if your benchmark has multiple tasks, you need to add --task <task>
# in your code to enable task selection.
# sweagent --help
# python src/main.py \
# python src/core/main.py \
# --task "test"
# --save_path "./outputs/systemcourseproject__${NEW_MODEL_NAME}__$(date +"%Y-%m-%d_%H-%M-%S")" \

python src/main_setup.py
# --model "$MODEL_NAME" \
uv run --no-sync python src/core/main.py \
--model_name "${MODEL_NAME}"
# --save_path "./outputs/systemcourseproject__${NEW_MODEL_NAME}__$(date +"%Y-%m-%d_%H-%M-%S")" \

# python src/main_setup.py \
# uv run --no-sync python src/core/main.py \
# --input_json "./data/benchmark/course_lab_task_examples.jsonl"

deactivate
1 change: 1 addition & 0 deletions benchmarks/arteval_bench/src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""ArtEval benchmark package."""
8 changes: 6 additions & 2 deletions benchmarks/arteval_bench/src/core/agents/claudecode/runner.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,9 @@ if [ $# -ne 2 ]; then
exit 1
fi

export ANTHROPIC_API_KEY="sk-XXXX"
claude -p "$2" --model "$1" --output-format json
if [ -z "${ANTHROPIC_API_KEY:-}" ]; then
echo "ANTHROPIC_API_KEY is not set"
exit 1
fi

claude -p "$2" --model "$1" --output-format json
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@ fi

pip install mini-swe-agent

export AZURE_API_KEY="XXXX"
export AZURE_API_BASE="XXXX"
export ANTHROPIC_API_KEY="sk-XXXX"
if [ -z "${AZURE_API_KEY:-}" ] && [ -z "${ANTHROPIC_API_KEY:-}" ]; then
echo "Neither AZURE_API_KEY nor ANTHROPIC_API_KEY is set"
exit 1
fi


mini -t "$2" -m "$1" -y -o agent_trajectory.json
# mini -t "set java env" -m "anthropic/claude-sonnet-4-5-20250929" -y
# mini -t "set java env" -m "anthropic/claude-sonnet-4-5-20250929" -y
7 changes: 5 additions & 2 deletions benchmarks/arteval_bench/src/core/agents/openhand/runner.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,11 @@ if [ $# -ne 2 ]; then
exit 1
fi

export ANTHROPIC_API_KEY="sk-XXXX"
if [ -z "${ANTHROPIC_API_KEY:-}" ]; then
echo "ANTHROPIC_API_KEY is not set"
exit 1
fi

echo "==> Start to run OpenHand Agent"
cd OpenHands/
poetry run python -m openhands.core.main --config-file /agent/config.toml --agent-cls CodeActAgent --selected-repo /repo -t "$2" --directory .
poetry run python -m openhands.core.main --config-file /agent/config.toml --agent-cls CodeActAgent --selected-repo /repo -t "$2" --directory .
Loading