sys-intelligence · Acture · Mar 3, 2026 · Mar 3, 2026 · Mar 3, 2026 · Mar 3, 2026
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,18 @@
+.git
+.gitignore
+
+.venv
+**/.venv
+.uv-cache
+**/.uv-cache
+
+__pycache__
+**/__pycache__
+*.pyc
+
+dist
+build
+*.egg-info
+
+logs
+outputs
diff --git a/.github/workflows/sdk-package.yml b/.github/workflows/sdk-package.yml
@@ -0,0 +1,48 @@
+name: SDK Package
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - 'sdk/**'
+      - 'pyproject.toml'
+      - '.github/workflows/sdk-package.yml'
+  pull_request:
+    paths:
+      - 'sdk/**'
+      - 'pyproject.toml'
+      - '.github/workflows/sdk-package.yml'
+  workflow_dispatch:
+
+# Least-privilege token: this workflow only checks out the repository.
+permissions:
+  contents: read
+
+jobs:
+  build-sdk:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+
+      - name: Build SDK package
+        run: uv build --package system-intelligence-sdk --wheel --sdist
+
+      - name: Verify package metadata
+        run: uvx twine check dist/system_intelligence_sdk-*
+
+      - name: Upload SDK dist artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: sdk-dist
+          path: dist/*
+          retention-days: 14
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -16,15 +16,22 @@ jobs:
       # If one benchmark fails, continue running the others
       fail-fast: false
       matrix:
-        benchmark:
-          - example_bench
-          - toposense_bench
-          - courselab_bench
-          - courseexam_bench
+        include:
+          - benchmark: example_bench
+            package: example-bench
+          - benchmark: toposense_bench
+            package: toposense-bench
+          - benchmark: courselab_bench
+            package: courselab-bench
+          - benchmark: courseexam_bench
+            package: courseexam-bench
           # TODO: For now, we comment out other benchmarks as they have no tests
-          # - arteval_bench
-          # - cache_bench
-          # - course_project_bench
+          # - benchmark: arteval_bench
+          #   package: arteval-bench
+          # - benchmark: cache_bench
+          #   package: cache-bench
+          # - benchmark: course_project_bench
+          #   package: course-project-bench
 
     steps:
     - name: Checkout code
@@ -33,25 +40,16 @@ jobs:
     - name: Set up Python
       uses: actions/setup-python@v5
       with:
-        python-version: '3.10'
+        python-version: '3.11'
+
+    - name: Install uv
+      uses: astral-sh/setup-uv@v6
 
     - name: Install dependencies
-      working-directory: benchmarks/${{ matrix.benchmark }}
       run: |
-        python -m venv env${{ matrix.benchmark }}
-        source env${{ matrix.benchmark }}/bin/activate
-        pip install --upgrade pip
-        pip install pytest
-        if [ -f requirements.txt ]; then
-          pip install -r requirements.txt
-        fi
-        if [ -f pyproject.toml ]; then
-          pip install -e ".[dev]"
-        fi
-        deactivate
+        uv sync --frozen --package "${{ matrix.package }}" --extra dev
 
     - name: Run tests
       run: |
-        source benchmarks/${{ matrix.benchmark }}/env${{ matrix.benchmark }}/bin/activate
-        pytest benchmarks/${{ matrix.benchmark }}/tests -v
-        deactivate
+        uv run --frozen --no-sync --package "${{ matrix.package }}" \
+          pytest benchmarks/${{ matrix.benchmark }}/tests -v
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,9 @@ __pycache__/
 *.pyc
 .venv/
 venv/
+build/
+dist/
+*.egg-info/
 
 # IDE
 .vscode/

diff --git a/README.md b/README.md
@@ -38,6 +38,9 @@ System Intelligence Benchmark currently includes the following example benchmark
 - **SDK** (`sdk/`) - Software development kit providing evaluators, LLM interfaces, and utility functions
 - **Documentation** (`doc/`) - Guides and documentation for using and contributing to System Intelligence Benchmark
 
+For the canonical repository boundaries and migration direction, see [doc/project_structure.md](doc/project_structure.md).
+For SDK packaging and release flow, see [doc/sdk_packaging.md](doc/sdk_packaging.md).
+
 ### Prerequisites
 
-- Python 3.9+
+- Python 3.11+
@@ -145,4 +148,3 @@ trademarks or logos is subject to and must follow
 [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
 Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
 Any use of third-party trademarks or logos are subject to those third-party's policies.
-
diff --git a/benchmarks/arteval_bench/Dockerfile b/benchmarks/arteval_bench/Dockerfile
@@ -1,34 +1,44 @@
-FROM ubuntu:24.04
+FROM ghcr.io/astral-sh/uv:python3.11-bookworm-slim AS builder
 
-ARG DEBIAN_FRONTEND=noninteractive
-
-USER root
+WORKDIR /workspace
+COPY . /workspace
+# Build the SDK wheel first and fail fast on error, then build every workspace wheel.
+RUN mkdir -p /workspace/dist \
+ && uv build --package system-intelligence-sdk --wheel -o /workspace/dist \
+ && uv build --all-packages --wheel -o /workspace/dist
 
-WORKDIR /
-COPY . .
+FROM ghcr.io/astral-sh/uv:python3.11-bookworm-slim
 
-RUN rm -rf /var/lib/apt/lists/* \
- && apt-get update -o Acquire::Retries=5 \
- && apt-get install -y --no-install-recommends \
-    build-essential \
-    git \
-    wget \
-    python3-pip \
-    python3-venv \
-    pipx \
+ARG DEBIAN_FRONTEND=noninteractive
+USER root
+RUN apt-get update && apt-get install -y --no-install-recommends git \
  && rm -rf /var/lib/apt/lists/*
 
-# SWE-ReX will always attempt to install its server into your docker container
-# however, this takes a couple of seconds. If we already provide it in the image,
-# this is much faster.
-RUN pipx install swe-rex 
-RUN pipx ensurepath
-
-ENV PATH="/root/.local/bin:${PATH}"
-ENV PATH="/usr/local/go/bin:${PATH}"
-
-SHELL ["/bin/bash", "-c"]
-
-RUN chmod +x install.sh test.sh && ./install.sh
-
-CMD ["bash"]
+# Build with repository root as context:
+# docker build -f benchmarks/arteval_bench/Dockerfile .
+WORKDIR /workspace
+COPY . /workspace
+COPY --from=builder /workspace/dist/*.whl /tmp/dist/
+
+WORKDIR /workspace/benchmarks/arteval_bench
+RUN set -eux; \
+ SDK_WHEEL="$(ls /tmp/dist/system_intelligence_sdk-*.whl | head -n1 || true)"; \
+ BENCH_WHEEL="$(ls /tmp/dist/arteval_bench-*.whl | head -n1 || true)"; \
+ if [ -z "$SDK_WHEEL" ]; then \
+   echo "Missing SDK wheel in /tmp/dist. Build with repo root context:"; \
+   echo "docker build -t arteval_bench -f benchmarks/arteval_bench/Dockerfile ."; \
+   ls -1 /tmp/dist || true; \
+   exit 1; \
+ fi; \
+ if [ -z "$BENCH_WHEEL" ]; then \
+   echo "Missing arteval_bench wheel in /tmp/dist."; \
+   ls -1 /tmp/dist || true; \
+   exit 1; \
+ fi; \
+ rm -rf .venv; \
+ uv venv .venv; \
+ uv pip install --python .venv/bin/python "$SDK_WHEEL" "$BENCH_WHEEL"; \
+ .venv/bin/python src/core/sweagent_compat.py >/dev/null; \
+ .venv/bin/sweagent --help >/dev/null
+
+CMD ["bash"]
diff --git a/benchmarks/arteval_bench/install.sh b/benchmarks/arteval_bench/install.sh
@@ -2,28 +2,25 @@
 
 set -e  # Exit immediately on error.
 
+if ! command -v uv >/dev/null 2>&1; then
+    echo "==> uv not found. Installing uv..."
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+    export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
+fi
+
+REPO_ROOT="$(git rev-parse --show-toplevel 2>/dev/null || pwd)"
+export UV_CACHE_DIR="${UV_CACHE_DIR:-${REPO_ROOT}/.uv-cache}"
+
 # if .venv does not exist, create it
 if [ -d ".venv" ]; then
     echo "==> .venv already exists, skipping creation."
 else
     echo "==> Creating .venv directory..."
-
-    python3 -m venv .venv
-    source .venv/bin/activate
-
-    if [ ! -d "SWE-agent" ]; then
-        echo "==> Install SWE-agent and its dependencies..."
-        git clone https://github.com/SWE-agent/SWE-agent.git
-        cd SWE-agent
-        git checkout 0c27f286303a939aa868ad2003bc4b6776771791
-        pip install --editable .
-        sweagent --help
-        cd ..
-    else
-        echo "==> SWE-agent repository already exists, skipping clone."
-    fi
-
-    deactivate
+    uv venv .venv
 fi
 
+uv sync --extra dev
+uv run --no-sync python src/core/sweagent_compat.py >/dev/null
+uv run --no-sync sweagent --help >/dev/null
+
 echo "==> ArtEvalBench environment is set up successfully."
diff --git a/benchmarks/arteval_bench/pyproject.toml b/benchmarks/arteval_bench/pyproject.toml
@@ -0,0 +1,28 @@
+[project]
+name = "arteval-bench"
+version = "0.1.0"
+description = "ArtEval benchmark package"
+requires-python = ">=3.11"
+dependencies = [
+  "system-intelligence-sdk>=0.1.0",
+  "requests",
+  "azure-identity",
+  "sweagent @ git+https://github.com/SWE-agent/SWE-agent.git@v1.1.0",
+]
+
+[project.optional-dependencies]
+dev = [
+  "pytest>=8.0.0",
+  "ruff>=0.6.0",
+]
+
+[build-system]
+requires = ["uv_build>=0.10.4,<0.11.0"]
+build-backend = "uv_build"
+
+[tool.uv.build-backend]
+module-name = "src"
+module-root = ""
+
+[tool.uv.sources]
+system-intelligence-sdk = { workspace = true }
diff --git a/benchmarks/arteval_bench/run.sh b/benchmarks/arteval_bench/run.sh
@@ -19,20 +19,22 @@ NEW_MODEL_NAME="${MODEL_NAME//\//_}"
 # export OPENAI_BASE_URL="http://localhost:2327/v1"
 # export OPENAI_API_KEY="EMPTY"
 
-source .venv/bin/activate
+if [ ! -x ".venv/bin/python" ]; then
+    echo "==> .venv is missing. Run ./install.sh first."
+    exit 1
+fi
+
 echo "==> Start to run ArtEvalBench"
-# Note that if you benchmark has multiple tasks, you need to add --task <task> 
+# Note that if your benchmark has multiple tasks, you need to add --task <task>
 # in your code to enable task selection.
 # sweagent --help
-# python src/main.py \
+# python src/core/main.py \
 #     --task "test"
     # --save_path "./outputs/systemcourseproject__${NEW_MODEL_NAME}__$(date +"%Y-%m-%d_%H-%M-%S")" \
 
-python src/main_setup.py
-    # --model "$MODEL_NAME" \
+uv run --no-sync python src/core/main.py \
+    --model_name "${MODEL_NAME}"
     # --save_path "./outputs/systemcourseproject__${NEW_MODEL_NAME}__$(date +"%Y-%m-%d_%H-%M-%S")" \
 
-# python src/main_setup.py \
+# uv run --no-sync python src/core/main.py \
 #     --input_json "./data/benchmark/course_lab_task_examples.jsonl" 
-
-deactivate
diff --git a/benchmarks/arteval_bench/src/__init__.py b/benchmarks/arteval_bench/src/__init__.py
@@ -0,0 +1 @@
+"""ArtEval benchmark package."""
diff --git a/benchmarks/arteval_bench/src/core/agents/claudecode/runner.sh b/benchmarks/arteval_bench/src/core/agents/claudecode/runner.sh
@@ -10,5 +10,10 @@ if [ $# -ne 2 ]; then
     exit 1
 fi
 
-export ANTHROPIC_API_KEY="sk-XXXX"
-claude -p "$2" --model "$1" --output-format json
+# The key must come from the environment; report on stderr so stdout carries only the JSON result.
+if [ -z "${ANTHROPIC_API_KEY:-}" ]; then
+    echo "ANTHROPIC_API_KEY is not set" >&2
+    exit 1
+fi
+
+claude -p "$2" --model "$1" --output-format json
diff --git a/benchmarks/arteval_bench/src/core/agents/minisweagent/runner.sh b/benchmarks/arteval_bench/src/core/agents/minisweagent/runner.sh
@@ -11,10 +11,11 @@ fi
 
 pip install mini-swe-agent
 
-export AZURE_API_KEY="XXXX"
-export AZURE_API_BASE="XXXX"
-export ANTHROPIC_API_KEY="sk-XXXX"
+if [ -z "${AZURE_API_KEY:-}" ] && [ -z "${ANTHROPIC_API_KEY:-}" ]; then
+    echo "Neither AZURE_API_KEY nor ANTHROPIC_API_KEY is set"
+    exit 1
+fi
 
 
 mini -t "$2" -m "$1" -y -o agent_trajectory.json
-# mini -t "set java env" -m "anthropic/claude-sonnet-4-5-20250929" -y
+# mini -t "set java env" -m "anthropic/claude-sonnet-4-5-20250929" -y
diff --git a/benchmarks/arteval_bench/src/core/agents/openhand/runner.sh b/benchmarks/arteval_bench/src/core/agents/openhand/runner.sh
@@ -10,8 +10,11 @@ if [ $# -ne 2 ]; then
     exit 1
 fi
 
-export ANTHROPIC_API_KEY="sk-XXXX"
+if [ -z "${ANTHROPIC_API_KEY:-}" ]; then
+    echo "ANTHROPIC_API_KEY is not set"
+    exit 1
+fi
 
 echo "==> Start to run OpenHand Agent"
 cd OpenHands/
-poetry run python -m openhands.core.main --config-file /agent/config.toml --agent-cls CodeActAgent --selected-repo /repo -t "$2" --directory .
+poetry run python -m openhands.core.main --config-file /agent/config.toml --agent-cls CodeActAgent --selected-repo /repo -t "$2" --directory .
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,6 +3,9 @@ __pycache__/ @@
     *.pyc
     .venv/
     venv/
+    build/
+    dist/
+    *.egg-info/
     # IDE
     .vscode/
@@ Expand Down @@