From f58e69b2621b76fb23fdd04eb4347774b76c00c5 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 29 Jan 2026 10:27:07 -0800 Subject: [PATCH 1/4] vibe coded buildkite prototype --- .buildkite/pipeline.yml | 45 ++ .github/workflows/build-runner-image.yml | 107 ++++ docker/kernelbot-runner/Dockerfile | 67 +++ .../kernelbot-runner/requirements-runner.txt | 17 + docs/BUILDKITE_POC.md | 304 ++++++++++ docs/docs/vendor-onboarding/buildkite.md | 243 ++++++++ docs/docs/vendor-onboarding/testing-guide.md | 519 ++++++++++++++++++ scripts/buildkite/setup-agent.sh | 157 ++++++ scripts/buildkite/update-image.sh | 50 ++ src/discord_cluster_manager.egg-info/PKG-INFO | 407 ++++++++++++++ .../SOURCES.txt | 71 +++ .../dependency_links.txt | 1 + .../requires.txt | 23 + .../top_level.txt | 4 + src/kernelbot/env.py | 5 + src/kernelbot/main.py | 11 +- src/libkernelbot/consts.py | 16 +- src/libkernelbot/launchers/__init__.py | 3 +- src/libkernelbot/launchers/buildkite.py | 300 ++++++++++ src/runners/buildkite-runner.py | 56 ++ tests/test_buildkite.py | 192 +++++++ 21 files changed, 2595 insertions(+), 3 deletions(-) create mode 100644 .buildkite/pipeline.yml create mode 100644 .github/workflows/build-runner-image.yml create mode 100644 docker/kernelbot-runner/Dockerfile create mode 100644 docker/kernelbot-runner/requirements-runner.txt create mode 100644 docs/BUILDKITE_POC.md create mode 100644 docs/docs/vendor-onboarding/buildkite.md create mode 100644 docs/docs/vendor-onboarding/testing-guide.md create mode 100755 scripts/buildkite/setup-agent.sh create mode 100755 scripts/buildkite/update-image.sh create mode 100644 src/discord_cluster_manager.egg-info/PKG-INFO create mode 100644 src/discord_cluster_manager.egg-info/SOURCES.txt create mode 100644 src/discord_cluster_manager.egg-info/dependency_links.txt create mode 100644 src/discord_cluster_manager.egg-info/requires.txt create mode 100644 src/discord_cluster_manager.egg-info/top_level.txt create mode 100644 src/libkernelbot/launchers/buildkite.py create mode 100644 src/runners/buildkite-runner.py create mode 100644 tests/test_buildkite.py diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml new file mode 100644 index 00000000..be68002a --- /dev/null +++ b/.buildkite/pipeline.yml @@ -0,0 +1,45 @@ +# Buildkite Pipeline for Kernel Submissions +# +# This pipeline runs kernel submissions on GPU-bound Buildkite agents. 
+# Each agent is configured with: +# - CUDA_VISIBLE_DEVICES bound to a single GPU +# - CPU/RAM limits via systemd cgroups +# - Queue tag for GPU routing (e.g., queue=nvidia-h100-0) +# +# Environment variables passed from BuildkiteLauncher: +# - SUBMISSION_PAYLOAD: Base64-encoded, zlib-compressed submission config +# - GPU_QUEUE: Queue name for agent routing + +steps: + - label: ":gpu: Run Kernel Submission" + command: "python /opt/kernelbot/buildkite-runner.py" + env: + # Payload is passed via BuildkiteLauncher + SUBMISSION_PAYLOAD: "${SUBMISSION_PAYLOAD}" + agents: + # Route to agent with matching queue tag + queue: "${GPU_QUEUE}" + timeout_in_minutes: 15 + artifact_paths: + - "result.json" + - "profile_data/**/*" + plugins: + - docker#v5.11.0: + image: "ghcr.io/gpu-mode/kernelbot-runner:latest" + always-pull: true + propagate-environment: true + # GPU access - agent already bound to single GPU via CUDA_VISIBLE_DEVICES + gpus: all + # Resource limits (can be overridden via env vars) + memory: "${MEMORY_LIMIT:-32g}" + cpus: "${CPU_LIMIT:-16}" + # Mount working directory for artifacts + volumes: + - ".:/workdir" + workdir: "/workdir" + retry: + automatic: + - exit_status: -1 # Agent lost connection + limit: 1 + - exit_status: 255 # SSH error + limit: 1 diff --git a/.github/workflows/build-runner-image.yml b/.github/workflows/build-runner-image.yml new file mode 100644 index 00000000..996a9424 --- /dev/null +++ b/.github/workflows/build-runner-image.yml @@ -0,0 +1,107 @@ +name: Build Runner Image + +on: + push: + branches: + - main + paths: + - 'docker/kernelbot-runner/**' + - 'src/libkernelbot/**' + - 'src/runners/buildkite-runner.py' + - '.github/workflows/build-runner-image.yml' + pull_request: + paths: + - 'docker/kernelbot-runner/**' + - 'src/libkernelbot/**' + - 'src/runners/buildkite-runner.py' + workflow_dispatch: + inputs: + push: + description: 'Push image to registry' + required: false + default: 'true' + type: boolean + schedule: + # Rebuild weekly on Sundays at 2 AM UTC + - cron: '0 2 * * 0' + +env: + REGISTRY: ghcr.io + IMAGE_NAME: gpu-mode/kernelbot-runner + +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Container Registry + if: github.event_name != 'pull_request' + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata for Docker + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=raw,value=latest,enable={{is_default_branch}} + type=sha,prefix=sha- + type=ref,event=branch + type=ref,event=pr + + - name: Build and push Docker image + uses: docker/build-push-action@v5 + with: + context: . 
+ file: docker/kernelbot-runner/Dockerfile + push: ${{ github.event_name != 'pull_request' && (github.event.inputs.push != 'false') }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Generate build summary + run: | + echo "## Docker Image Build Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Image:** \`${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}\`" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Tags:**" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + echo "${{ steps.meta.outputs.tags }}" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + + - name: Notify vendors (Slack) + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + continue-on-error: true + uses: slackapi/slack-github-action@v1.25.0 + with: + payload: | + { + "text": "New kernelbot-runner image published", + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "*New kernelbot-runner image published* :package:\n\nVendors: run `./scripts/buildkite/update-image.sh` to update your agents.\n\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View build>" + } + } + ] + } + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_VENDOR_WEBHOOK }} + SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK diff --git a/docker/kernelbot-runner/Dockerfile b/docker/kernelbot-runner/Dockerfile new file mode 100644 index 00000000..8708f2fe --- /dev/null +++ b/docker/kernelbot-runner/Dockerfile @@ -0,0 +1,67 @@ +# Kernelbot Runner Docker Image +# +# This image is used by Buildkite agents to run kernel submissions. +# It matches the Modal runner configuration for consistent behavior. +# +# Build: +# docker build -t ghcr.io/gpu-mode/kernelbot-runner:latest -f docker/kernelbot-runner/Dockerfile . +# +# Run locally (for testing): +# docker run --gpus '"device=0"' -e SUBMISSION_PAYLOAD="..." 
kernelbot-runner:latest + +FROM nvidia/cuda:13.1.0-devel-ubuntu24.04 + +LABEL org.opencontainers.image.source="https://github.com/gpu-mode/kernelbot" +LABEL org.opencontainers.image.description="Kernelbot GPU runner for kernel competitions" + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3.13 \ + python3.13-venv \ + python3-pip \ + git \ + gcc-13 \ + g++-13 \ + clang-18 \ + curl \ + && rm -rf /var/lib/apt/lists/* \ + && ln -sf /usr/bin/python3.13 /usr/bin/python3 \ + && ln -sf /usr/bin/python3.13 /usr/bin/python + +# Create virtual environment +RUN python3 -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# Install Python dependencies (matching modal_runner.py) +COPY docker/kernelbot-runner/requirements-runner.txt /tmp/ +RUN pip install --upgrade pip && \ + pip install -r /tmp/requirements-runner.txt + +# Install PyTorch with CUDA 13.0 support +RUN pip install \ + torch==2.9.1 \ + torchvision \ + torchaudio \ + --index-url https://download.pytorch.org/whl/cu130 + +# Install additional frameworks +RUN pip install \ + tinygrad~=0.10 + +# Install NVIDIA CUDA packages +RUN pip install \ + nvidia-cupynumeric~=25.3 \ + nvidia-cutlass-dsl==4.3.5 \ + "cuda-core[cu13]" \ + "cuda-python[all]==13.0" + +# Copy kernelbot library and runner +WORKDIR /opt/kernelbot +COPY src/libkernelbot /opt/kernelbot/libkernelbot +COPY src/runners/buildkite-runner.py /opt/kernelbot/ + +# Set PYTHONPATH so libkernelbot is importable +ENV PYTHONPATH="/opt/kernelbot:$PYTHONPATH" + +# Default command +CMD ["python", "/opt/kernelbot/buildkite-runner.py"] diff --git a/docker/kernelbot-runner/requirements-runner.txt b/docker/kernelbot-runner/requirements-runner.txt new file mode 100644 index 00000000..8ba03df9 --- /dev/null +++ b/docker/kernelbot-runner/requirements-runner.txt @@ -0,0 +1,17 @@ +# Kernelbot Runner Dependencies +# These should match the Modal runner configuration in modal_runner.py + +# Build tools +ninja~=1.11 +wheel~=0.45 +setuptools + +# Core dependencies +requests~=2.32.4 +packaging~=25.0 +numpy~=2.3 +pytest +PyYAML + +# Triton for GPU kernels +triton diff --git a/docs/BUILDKITE_POC.md b/docs/BUILDKITE_POC.md new file mode 100644 index 00000000..0afe10ed --- /dev/null +++ b/docs/BUILDKITE_POC.md @@ -0,0 +1,304 @@ +# Buildkite Integration POC + +## Executive Summary + +This document describes a proof-of-concept implementation of Buildkite as a new scheduler for Kernelbot GPU kernel competitions. Buildkite solves critical isolation problems that make microbenchmarking on vendor-donated hardware unreliable. + +**Status**: Implementation complete, unit tests passing, ready for integration testing with real Buildkite agents. 
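+
+The contract between launcher and runner is deliberately small: each submission travels as a base64-encoded, zlib-compressed JSON blob in `SUBMISSION_PAYLOAD`. A minimal sketch of that round trip (the helper names are illustrative; field names follow the test payloads later in this document):
+
+```python
+import base64
+import json
+import zlib
+
+def encode_payload(config: dict) -> str:
+    # What BuildkiteLauncher does before creating a build (sketch).
+    return base64.b64encode(zlib.compress(json.dumps(config).encode())).decode()
+
+def decode_payload(payload: str) -> dict:
+    # What buildkite-runner.py does on the agent (sketch).
+    return json.loads(zlib.decompress(base64.b64decode(payload)).decode())
+
+config = {"lang": "py", "mode": "test", "files": {"main.py": "print('hi')"}}
+assert decode_payload(encode_payload(config)) == config
+```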
+ +--- + +## Problem Statement + +When vendors donate GPU compute for kernel competitions, we face these challenges: + +| Problem | Impact | +|---------|--------| +| Multiple kernels on same GPU | Measurements become unreliable | +| No CPU/RAM isolation | Neighbor jobs affect benchmarks | +| Complex runner setup | Vendors spend weeks configuring isolation | +| No standardized onboarding | Each vendor does it differently | + +### Current State + +- **Modal**: Good isolation but cloud-only, can't use donated on-prem hardware +- **GitHub Actions**: Runners see all GPUs, no resource limits, complex setup + +--- + +## Solution: Buildkite + +Buildkite provides the primitives we need for proper isolation: + +| Requirement | Buildkite Solution | +|-------------|-------------------| +| 1 GPU per job | 1 agent per GPU, bound via `CUDA_VISIBLE_DEVICES` | +| CPU/RAM limits | Agent runs in systemd cgroup slice | +| No interference | Agent processes 1 job at a time (default) | +| Queue routing | Agent tags route jobs to specific GPUs | +| Easy onboarding | Bootstrap script + Dockerfile in our repo | + +### Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Vendor Node (8x H100, 256GB RAM, 128 cores) │ +├─────────────────────────────────────────────────────────────────┤ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ Agent 0 │ │ Agent 1 │ ... │ Agent 7 │ │ +│ │ GPU 0 only │ │ GPU 1 only │ │ GPU 7 only │ │ +│ │ 32GB RAM │ │ 32GB RAM │ │ 32GB RAM │ │ +│ │ 16 CPU cores │ │ 16 CPU cores │ │ 16 CPU cores │ │ +│ │ queue= │ │ queue= │ │ queue= │ │ +│ │ nvidia-h100-0│ │ nvidia-h100-1│ │ nvidia-h100-7│ │ +│ └──────────────┘ └──────────────┘ └──────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Queue Naming Convention + +Format: `{vendor}-{gpu_type}-{index}` + +Examples: +- `nvidia-h100-0` - NVIDIA-donated H100, first GPU +- `amd-mi300-0` - AMD-donated MI300 +- `google-tpu-0` - Google-donated TPU +- `nebius-h100-0` - Nebius-donated H100 + +This supports concurrent competitions where different vendors donate the same GPU type. 
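+
+Queue names of this shape can be derived mechanically. A minimal sketch (this helper is illustrative, not part of the implementation):
+
+```python
+def queue_name(vendor: str, gpu_type: str, index: int) -> str:
+    """Build a Buildkite queue tag like 'nvidia-h100-0'."""
+    return f"{vendor}-{gpu_type}-{index}".lower()
+
+assert queue_name("nvidia", "h100", 0) == "nvidia-h100-0"
+assert queue_name("amd", "mi300", 0) == "amd-mi300-0"
+```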
+ +--- + +## Implementation + +### Files Created + +| File | Purpose | +|------|---------| +| `src/libkernelbot/launchers/buildkite.py` | BuildkiteLauncher class | +| `src/runners/buildkite-runner.py` | Runner script for agents | +| `docker/kernelbot-runner/Dockerfile` | Container image (source of truth) | +| `docker/kernelbot-runner/requirements-runner.txt` | Python dependencies | +| `.buildkite/pipeline.yml` | Buildkite pipeline config | +| `scripts/buildkite/setup-agent.sh` | Agent bootstrap script | +| `scripts/buildkite/update-image.sh` | Image update script | +| `.github/workflows/build-runner-image.yml` | Auto-build on changes | +| `docs/docs/vendor-onboarding/buildkite.md` | Vendor setup guide | +| `docs/docs/vendor-onboarding/testing-guide.md` | Testing instructions | +| `tests/test_buildkite.py` | Unit tests | + +### Files Modified + +| File | Changes | +|------|---------| +| `src/libkernelbot/consts.py` | Added `BuildkiteGPU` enum, `BUILDKITE` scheduler | +| `src/libkernelbot/launchers/__init__.py` | Export `BuildkiteLauncher` | +| `src/kernelbot/env.py` | Buildkite env vars | +| `src/kernelbot/main.py` | Register launcher if token set | + +### Key Code + +**BuildkiteLauncher** (`src/libkernelbot/launchers/buildkite.py`): +```python +class BuildkiteLauncher(Launcher): + def __init__(self, org: str, pipeline: str, token: str): + super().__init__(name="Buildkite", gpus=BuildkiteGPU) + # ... + + async def run_submission(self, config, gpu_type, status) -> FullResult: + # 1. Compress config (zlib + base64) + # 2. Create build via Buildkite API + # 3. Poll for completion + # 4. Download artifacts + # 5. Parse result.json -> FullResult +``` + +**Agent Setup** (`scripts/buildkite/setup-agent.sh`): +```bash +# Creates per-GPU systemd service with: +Environment="CUDA_VISIBLE_DEVICES=${GPU_INDEX}" +Environment="BUILDKITE_AGENT_TAGS=queue=${QUEUE_NAME}" +Slice=buildkite-gpu${GPU_INDEX}.slice # cgroup isolation +``` + +--- + +## Testing + +### Unit Tests (All Passing) + +```bash +uv run pytest tests/test_buildkite.py -v +``` + +``` +tests/test_buildkite.py::TestBuildkiteGPU::test_enum_values PASSED +tests/test_buildkite.py::TestBuildkiteGPU::test_scheduler_type_exists PASSED +tests/test_buildkite.py::TestBuildkiteGPU::test_gpu_lookup PASSED +tests/test_buildkite.py::TestBuildkiteLauncher::test_init PASSED +tests/test_buildkite.py::TestBuildkiteLauncher::test_headers PASSED +tests/test_buildkite.py::TestBuildkiteLauncher::test_payload_compression PASSED +tests/test_buildkite.py::TestBuildkiteLauncher::test_run_submission_creates_build PASSED +tests/test_buildkite.py::TestBuildkiteLauncher::test_run_submission_handles_api_error PASSED +tests/test_buildkite.py::TestBuildkiteLauncher::test_status_updates PASSED +tests/test_buildkite.py::TestBuildkiteRunner::test_runner_script_syntax PASSED + +============================== 10 passed ============================== +``` + +### Import/Integration Tests + +```bash +# Verify imports work +uv run python -c " +from libkernelbot.launchers import BuildkiteLauncher +from libkernelbot.consts import BuildkiteGPU, get_gpu_by_name + +launcher = BuildkiteLauncher(org='test', pipeline='test', token='fake') +print(f'Launcher: {launcher.name}') +print(f'GPUs: {[g.value for g in BuildkiteGPU]}') +" +``` + +Output: +``` +Launcher: Buildkite +GPUs: ['nvidia-h100', 'nvidia-b200', 'nvidia-a100', 'amd-mi300', 'amd-mi250', 'google-tpu', 'nebius-h100'] +``` + +### Local Container Test (For Vendors) + +```bash +# Build image +docker build -t kernelbot-runner:test -f 
docker/kernelbot-runner/Dockerfile . + +# Test with single GPU +docker run --rm --gpus '"device=0"' \ + -e SUBMISSION_PAYLOAD="$(python3 -c ' +import json, zlib, base64 +config = {"lang": "py", "mode": "test", "files": {"main.py": "import torch; print(torch.cuda.get_device_name(0))"}, "tests": [], "benchmarks": [], "test_timeout": 60, "benchmark_timeout": 60, "ranked_timeout": 60} +print(base64.b64encode(zlib.compress(json.dumps(config).encode())).decode()) +')" \ + kernelbot-runner:test +``` + +--- + +## Vendor Onboarding Flow + +### For Vendors + +1. **Get Buildkite token** from Kernelbot team +2. **Clone repo**: `git clone https://github.com/gpu-mode/kernelbot.git` +3. **Pull image**: `docker pull ghcr.io/gpu-mode/kernelbot-runner:latest` +4. **Run setup script** for each GPU: + ```bash + sudo ./scripts/buildkite/setup-agent.sh 0 nvidia-h100-0 32G 16 + sudo ./scripts/buildkite/setup-agent.sh 1 nvidia-h100-1 32G 16 + # ... for all GPUs + ``` +5. **Set token**: Edit `/etc/buildkite-agent/token` +6. **Start agents**: `sudo systemctl start 'buildkite-agent-gpu*'` +7. **Verify**: Check Buildkite dashboard for connected agents + +### For Kernelbot Team + +1. Set env vars: + ```bash + BUILDKITE_API_TOKEN=bkua_xxxxx + BUILDKITE_ORG=gpu-mode + BUILDKITE_PIPELINE=kernelbot-runner + ``` +2. Launcher auto-registers if token is set +3. Add GPU types to leaderboard configs + +--- + +## Next Steps + +### Immediate (For Integration Testing) + +- [ ] Create Buildkite organization and pipeline +- [ ] Set up 1 test agent on a GPU machine +- [ ] Run end-to-end test via API +- [ ] Compare benchmark results with Modal + +### Before Production + +- [ ] Set up GitHub Container Registry for image +- [ ] Configure Slack webhook for vendor notifications +- [ ] Test with multiple concurrent jobs +- [ ] Document SLA expectations for vendors + +### Future Enhancements + +- [ ] Webhook-based completion (instead of polling) +- [ ] Agent health monitoring dashboard +- [ ] Automatic image version checking +- [ ] Support for non-NVIDIA GPUs (TPU, AMD) + +--- + +## Configuration Reference + +### Environment Variables + +| Variable | Required | Default | Description | +|----------|----------|---------|-------------| +| `BUILDKITE_API_TOKEN` | Yes | - | Buildkite API token | +| `BUILDKITE_ORG` | No | `gpu-mode` | Buildkite org slug | +| `BUILDKITE_PIPELINE` | No | `kernelbot-runner` | Pipeline slug | + +### GPU Types (BuildkiteGPU Enum) + +| Enum Name | Queue Value | Description | +|-----------|-------------|-------------| +| `NVIDIA_H100` | `nvidia-h100` | NVIDIA H100 | +| `NVIDIA_B200` | `nvidia-b200` | NVIDIA B200 | +| `NVIDIA_A100` | `nvidia-a100` | NVIDIA A100 | +| `AMD_MI300` | `amd-mi300` | AMD MI300 | +| `AMD_MI250` | `amd-mi250` | AMD MI250 | +| `GOOGLE_TPU` | `google-tpu` | Google TPU | +| `NEBIUS_H100` | `nebius-h100` | Nebius H100 | + +--- + +## Files Reference + +``` +kernelbot/ +├── .buildkite/ +│ └── pipeline.yml # Buildkite pipeline +├── .github/workflows/ +│ └── build-runner-image.yml # Auto-build Docker image +├── docker/kernelbot-runner/ +│ ├── Dockerfile # Runner container +│ └── requirements-runner.txt # Python deps +├── docs/docs/vendor-onboarding/ +│ ├── buildkite.md # Vendor setup guide +│ └── testing-guide.md # Testing instructions +├── scripts/buildkite/ +│ ├── setup-agent.sh # Agent bootstrap +│ └── update-image.sh # Image updater +├── src/ +│ ├── kernelbot/ +│ │ ├── env.py # +Buildkite env vars +│ │ └── main.py # +Register launcher +│ ├── libkernelbot/ +│ │ ├── consts.py # +BuildkiteGPU enum +│ │ 
└── launchers/ +│ │ ├── __init__.py # +Export +│ │ └── buildkite.py # BuildkiteLauncher +│ └── runners/ +│ └── buildkite-runner.py # Runner script +└── tests/ + └── test_buildkite.py # Unit tests +``` + +--- + +## Contact + +- **Implementation**: [Your name] +- **Questions**: #kernelbot-infra on Discord +- **Issues**: https://github.com/gpu-mode/kernelbot/issues diff --git a/docs/docs/vendor-onboarding/buildkite.md b/docs/docs/vendor-onboarding/buildkite.md new file mode 100644 index 00000000..8250e9c4 --- /dev/null +++ b/docs/docs/vendor-onboarding/buildkite.md @@ -0,0 +1,243 @@ +--- +sidebar_position: 1 +--- + +# Buildkite Vendor Onboarding + +This guide explains how to set up Buildkite agents on your hardware to run GPU kernel competitions for Kernelbot. + +## Overview + +Kernelbot uses Buildkite to run GPU kernel submissions on vendor-donated hardware. Each GPU on your machine runs as an isolated Buildkite agent with: + +- **GPU Isolation**: Single GPU per agent via `CUDA_VISIBLE_DEVICES` +- **CPU/RAM Limits**: Resource constraints via systemd cgroups +- **Queue Routing**: Jobs routed to specific GPUs via queue tags + +## Prerequisites + +Before setting up agents, ensure you have: + +1. **Linux server** with NVIDIA GPUs (Ubuntu 22.04+ recommended) +2. **Docker** installed with [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) +3. **Buildkite agent** installed ([installation guide](https://buildkite.com/docs/agent/v3/installation)) +4. **Buildkite organization token** from the Kernelbot team + +### Verify Prerequisites + +```bash +# Check NVIDIA driver +nvidia-smi + +# Check Docker with GPU support +docker run --rm --gpus all nvidia/cuda:13.1.0-base-ubuntu24.04 nvidia-smi + +# Check Buildkite agent +buildkite-agent --version +``` + +## Queue Naming Convention + +Queues follow the pattern: `{vendor}-{gpu_type}-{index}` + +Examples: +- `nvidia-h100-0` - NVIDIA-donated H100, GPU index 0 +- `nvidia-h100-1` - NVIDIA-donated H100, GPU index 1 +- `amd-mi300-0` - AMD-donated MI300 +- `google-tpu-0` - Google-donated TPU +- `nebius-h100-0` - Nebius-donated H100 + +Contact the Kernelbot team to register your queue names. + +## Setup Instructions + +### Step 1: Clone the Repository + +```bash +git clone https://github.com/gpu-mode/kernelbot.git +cd kernelbot +``` + +### Step 2: Pull the Runner Image + +```bash +docker pull ghcr.io/gpu-mode/kernelbot-runner:latest +``` + +### Step 3: Configure Agents + +Run the setup script for each GPU. For an 8-GPU node: + +```bash +# Set your Buildkite token +echo "BUILDKITE_AGENT_TOKEN=your-token-here" | sudo tee /etc/buildkite-agent/token + +# Set up each GPU (adjust queue names for your vendor) +sudo ./scripts/buildkite/setup-agent.sh 0 nvidia-h100-0 32G 16 +sudo ./scripts/buildkite/setup-agent.sh 1 nvidia-h100-1 32G 16 +sudo ./scripts/buildkite/setup-agent.sh 2 nvidia-h100-2 32G 16 +# ... repeat for all GPUs +``` + +Arguments: +- `GPU_INDEX`: GPU device index (0, 1, 2, ...) +- `QUEUE_NAME`: Queue name following convention above +- `MEMORY_LIMIT`: RAM limit per agent (default: 32G) +- `CPU_CORES`: CPU cores per agent (default: 16) + +### Step 4: Start Agents + +```bash +# Start all GPU agents +sudo systemctl start buildkite-agent-gpu0 +sudo systemctl start buildkite-agent-gpu1 +# ... 
etc + +# Or start all at once +sudo systemctl start 'buildkite-agent-gpu*' +``` + +### Step 5: Verify Setup + +```bash +# Check agent status +sudo systemctl status buildkite-agent-gpu0 + +# View logs +sudo journalctl -u buildkite-agent-gpu0 -f + +# Verify agent appears in Buildkite dashboard +# https://buildkite.com/organizations//agents +``` + +## Testing Your Setup + +### Local Test (Without Buildkite) + +Test the runner image directly: + +```bash +# Create a test payload +TEST_PAYLOAD=$(python3 -c " +import json, zlib, base64 +config = { + 'lang': 'py', + 'mode': 'test', + 'files': {'main.py': 'print(\"Hello GPU!\")'}, + 'tests': [], + 'benchmarks': [] +} +print(base64.b64encode(zlib.compress(json.dumps(config).encode())).decode()) +") + +# Run in container (single GPU) +docker run --rm --gpus '"device=0"' \ + -e SUBMISSION_PAYLOAD="$TEST_PAYLOAD" \ + ghcr.io/gpu-mode/kernelbot-runner:latest + +# Check if result.json would be created +ls -la result.json +``` + +### Integration Test (Via Buildkite) + +Trigger a test build: + +```bash +curl -X POST "https://api.buildkite.com/v2/organizations/gpu-mode/pipelines/kernelbot-runner/builds" \ + -H "Authorization: Bearer $BUILDKITE_API_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "commit": "HEAD", + "branch": "main", + "message": "Test submission", + "env": { + "GPU_QUEUE": "your-queue-name", + "SUBMISSION_PAYLOAD": "'"$TEST_PAYLOAD"'" + } + }' +``` + +Check the Buildkite dashboard for job results. + +### Isolation Verification + +Verify GPU and resource isolation: + +```bash +# Inside agent container, verify only 1 GPU visible +docker run --rm --gpus '"device=0"' nvidia/cuda:13.1.0-base-ubuntu24.04 nvidia-smi +# Should show only GPU 0 + +# Verify cgroup limits +cat /sys/fs/cgroup/buildkite-gpu0.slice/memory.max +cat /sys/fs/cgroup/buildkite-gpu0.slice/cpu.max +``` + +## Updating the Runner Image + +When notified of a new image release: + +```bash +sudo ./scripts/buildkite/update-image.sh +``` + +This pulls the latest image and optionally restarts agents. + +### Automatic Updates (Optional) + +Set up a cron job for automatic updates: + +```bash +# Check for updates daily at 3 AM +echo "0 3 * * * root /path/to/kernelbot/scripts/buildkite/update-image.sh --auto" | sudo tee /etc/cron.d/kernelbot-update +``` + +## Troubleshooting + +### Agent Not Picking Up Jobs + +1. Check agent is running: `systemctl status buildkite-agent-gpu0` +2. Verify queue tag matches: check `/etc/buildkite-agent/agent-0/buildkite-agent.cfg` +3. Ensure agent appears in Buildkite dashboard + +### GPU Not Visible in Container + +1. Check NVIDIA Container Toolkit: `docker run --rm --gpus all nvidia/cuda:13.1.0-base-ubuntu24.04 nvidia-smi` +2. Verify CUDA_VISIBLE_DEVICES is set correctly in systemd unit +3. Check Docker runtime config: `docker info | grep -i runtime` + +### Jobs Timing Out + +1. Check resource limits aren't too restrictive +2. Review job logs in Buildkite dashboard +3. Test image locally first + +### Memory/CPU Limits Not Working + +1. Verify cgroup v2 is enabled: `mount | grep cgroup2` +2. Check slice file exists: `cat /etc/systemd/system/buildkite-gpu0.slice` +3. 
Reload systemd: `systemctl daemon-reload` + +## Support + +- **Slack**: #kernelbot-infra in GPU Mode Discord +- **Issues**: https://github.com/gpu-mode/kernelbot/issues +- **Email**: infra@gpu-mode.org + +## Hardware Requirements + +Per GPU agent: + +| Resource | Minimum | Recommended | +|----------|---------|-------------| +| RAM | 16 GB | 32 GB | +| CPU Cores | 8 | 16 | +| Disk | 50 GB | 100 GB | +| Network | 100 Mbps | 1 Gbps | + +For an 8-GPU node, plan for: +- 256 GB RAM (32 GB per GPU) +- 128 CPU cores (16 per GPU) +- 800 GB disk +- Fast network for image pulls diff --git a/docs/docs/vendor-onboarding/testing-guide.md b/docs/docs/vendor-onboarding/testing-guide.md new file mode 100644 index 00000000..4cc1c703 --- /dev/null +++ b/docs/docs/vendor-onboarding/testing-guide.md @@ -0,0 +1,519 @@ +--- +sidebar_position: 2 +--- + +# Buildkite Testing Guide + +This guide covers how to test the Buildkite integration at various levels: local development, vendor validation, and end-to-end integration. + +## Testing Levels + +| Level | Who | Purpose | +|-------|-----|---------| +| Unit Tests | Kernelbot developers | Test launcher logic with mocked API | +| Local Container | Vendors | Verify runner image works with GPU | +| Agent Integration | Vendors | Verify agent picks up and runs jobs | +| End-to-End | Both | Full submission flow through Discord/API | + +--- + +## 1. Unit Tests (Kernelbot Developers) + +### Test BuildkiteLauncher with Mocked API + +```python +# tests/test_buildkite_launcher.py +import pytest +from unittest.mock import AsyncMock, patch, MagicMock +import json +import base64 +import zlib + +from libkernelbot.launchers.buildkite import BuildkiteLauncher +from libkernelbot.consts import BuildkiteGPU, GPU + + +@pytest.fixture +def launcher(): + return BuildkiteLauncher( + org="test-org", + pipeline="test-pipeline", + token="test-token" + ) + + +@pytest.fixture +def mock_config(): + return { + "lang": "py", + "mode": "test", + "files": {"main.py": "print('hello')"}, + "tests": [], + "benchmarks": [], + "test_timeout": 180, + } + + +@pytest.fixture +def gpu_type(): + return GPU(name="NVIDIA_H100", value="nvidia-h100", runner="Buildkite") + + +class TestBuildkiteLauncher: + def test_init(self, launcher): + assert launcher.name == "Buildkite" + assert launcher.org == "test-org" + assert launcher.pipeline == "test-pipeline" + assert launcher.gpus == BuildkiteGPU + + @pytest.mark.asyncio + async def test_run_submission_creates_build(self, launcher, mock_config, gpu_type): + """Test that run_submission creates a Buildkite build.""" + mock_response = MagicMock() + mock_response.json.return_value = { + "number": 123, + "web_url": "https://buildkite.com/test/builds/123", + "state": "scheduled", + } + mock_response.raise_for_status = MagicMock() + + with patch("requests.post", return_value=mock_response) as mock_post: + with patch.object(launcher, "_wait_for_completion", new_callable=AsyncMock): + with patch.object(launcher, "_download_and_parse_result", new_callable=AsyncMock) as mock_download: + mock_download.return_value = MagicMock(success=True) + + mock_status = AsyncMock() + result = await launcher.run_submission(mock_config, gpu_type, mock_status) + + # Verify API was called + mock_post.assert_called_once() + call_args = mock_post.call_args + + # Check URL + assert "test-org" in call_args[0][0] + assert "test-pipeline" in call_args[0][0] + + # Check payload was compressed + body = call_args[1]["json"] + assert "SUBMISSION_PAYLOAD" in body["env"] + assert body["env"]["GPU_QUEUE"] == 
"nvidia-h100" + + @pytest.mark.asyncio + async def test_payload_compression(self, launcher, mock_config, gpu_type): + """Test that config is properly compressed.""" + captured_payload = None + + def capture_post(*args, **kwargs): + nonlocal captured_payload + captured_payload = kwargs["json"]["env"]["SUBMISSION_PAYLOAD"] + response = MagicMock() + response.json.return_value = {"number": 1, "web_url": "http://test", "state": "scheduled"} + response.raise_for_status = MagicMock() + return response + + with patch("requests.post", side_effect=capture_post): + with patch.object(launcher, "_wait_for_completion", new_callable=AsyncMock): + with patch.object(launcher, "_download_and_parse_result", new_callable=AsyncMock): + mock_status = AsyncMock() + await launcher.run_submission(mock_config, gpu_type, mock_status) + + # Decompress and verify + decompressed = zlib.decompress(base64.b64decode(captured_payload)).decode() + parsed = json.loads(decompressed) + assert parsed["lang"] == "py" + assert parsed["mode"] == "test" +``` + +### Run Unit Tests + +```bash +pytest tests/test_buildkite_launcher.py -v +``` + +--- + +## 2. Local Container Tests (Vendors) + +### 2.1 Basic Image Test + +Verify the image runs and has correct dependencies: + +```bash +# Pull the image +docker pull ghcr.io/gpu-mode/kernelbot-runner:latest + +# Check Python and dependencies +docker run --rm ghcr.io/gpu-mode/kernelbot-runner:latest python --version +docker run --rm ghcr.io/gpu-mode/kernelbot-runner:latest pip list | grep torch + +# Check GPU access +docker run --rm --gpus all ghcr.io/gpu-mode/kernelbot-runner:latest nvidia-smi +``` + +### 2.2 Single GPU Isolation Test + +Verify only the specified GPU is visible: + +```bash +# Should only show GPU 0 +docker run --rm --gpus '"device=0"' ghcr.io/gpu-mode/kernelbot-runner:latest nvidia-smi + +# Should only show GPU 1 +docker run --rm --gpus '"device=1"' ghcr.io/gpu-mode/kernelbot-runner:latest nvidia-smi +``` + +### 2.3 Runner Script Test + +Test the runner with a simple payload: + +```bash +# Create test payload +create_test_payload() { + python3 -c " +import json, zlib, base64 +config = { + 'lang': 'py', + 'mode': 'test', + 'files': { + 'main.py': ''' +import torch +print(f\"PyTorch version: {torch.__version__}\") +print(f\"CUDA available: {torch.cuda.is_available()}\") +if torch.cuda.is_available(): + print(f\"GPU: {torch.cuda.get_device_name(0)}\") + print(f\"GPU count: {torch.cuda.device_count()}\") +''' + }, + 'tests': [], + 'benchmarks': [], + 'test_timeout': 60, + 'benchmark_timeout': 60, + 'ranked_timeout': 60, +} +print(base64.b64encode(zlib.compress(json.dumps(config).encode())).decode()) +" +} + +# Run with payload +docker run --rm --gpus '"device=0"' \ + -e SUBMISSION_PAYLOAD="$(create_test_payload)" \ + -v "$(pwd)/test-output:/workdir" \ + -w /workdir \ + ghcr.io/gpu-mode/kernelbot-runner:latest + +# Check output +cat test-output/result.json | jq . 
+``` + +### 2.4 CUDA Kernel Test + +Test a simple CUDA kernel submission: + +```bash +create_cuda_payload() { + python3 -c " +import json, zlib, base64 +config = { + 'lang': 'py', + 'mode': 'test', + 'files': { + 'main.py': ''' +import torch +import torch.nn as nn + +# Simple GPU operation +x = torch.randn(1000, 1000, device=\"cuda\") +y = torch.randn(1000, 1000, device=\"cuda\") +z = torch.matmul(x, y) +print(f\"Matrix multiply result shape: {z.shape}\") +print(f\"Result sum: {z.sum().item():.2f}\") +''' + }, + 'tests': [], + 'benchmarks': [], + 'test_timeout': 60, + 'benchmark_timeout': 60, + 'ranked_timeout': 60, +} +print(base64.b64encode(zlib.compress(json.dumps(config).encode())).decode()) +" +} + +docker run --rm --gpus '"device=0"' \ + -e SUBMISSION_PAYLOAD="$(create_cuda_payload)" \ + -v "$(pwd)/test-output:/workdir" \ + -w /workdir \ + ghcr.io/gpu-mode/kernelbot-runner:latest +``` + +### 2.5 Resource Limit Test + +Test memory limits are enforced: + +```bash +# Run with memory limit +docker run --rm --gpus '"device=0"' \ + --memory=4g \ + -e SUBMISSION_PAYLOAD="$(create_test_payload)" \ + ghcr.io/gpu-mode/kernelbot-runner:latest + +# Check container saw the limit +docker run --rm --memory=4g ghcr.io/gpu-mode/kernelbot-runner:latest \ + cat /sys/fs/cgroup/memory.max +``` + +--- + +## 3. Agent Integration Tests (Vendors) + +### 3.1 Agent Health Check + +After setting up agents, verify they're healthy: + +```bash +# Check systemd service status +sudo systemctl status buildkite-agent-gpu0 +sudo systemctl status buildkite-agent-gpu1 + +# Check agent logs +sudo journalctl -u buildkite-agent-gpu0 --since "5 minutes ago" + +# Verify agent appears in Buildkite dashboard +curl -s -H "Authorization: Bearer $BUILDKITE_API_TOKEN" \ + "https://api.buildkite.com/v2/organizations/gpu-mode/agents" | jq '.[] | {name, connection_state, metadata}' +``` + +### 3.2 Cgroup Isolation Verification + +Verify resource isolation is working: + +```bash +# Check memory limit +cat /sys/fs/cgroup/buildkite-gpu0.slice/memory.max +# Should show your configured limit (e.g., 34359738368 for 32G) + +# Check CPU quota +cat /sys/fs/cgroup/buildkite-gpu0.slice/cpu.max +# Should show something like "1600000 100000" for 16 cores + +# Verify agent is in the slice +systemctl status buildkite-agent-gpu0 | grep "CGroup" +``` + +### 3.3 GPU Binding Verification + +Verify each agent only sees its assigned GPU: + +```bash +# Check what GPU agent 0 sees +sudo -u buildkite CUDA_VISIBLE_DEVICES=0 nvidia-smi -L +# Should show only GPU 0 + +# Check what GPU agent 1 sees +sudo -u buildkite CUDA_VISIBLE_DEVICES=1 nvidia-smi -L +# Should show only GPU 1 +``` + +### 3.4 Trigger Test Build + +Trigger a test build and verify it runs on correct agent: + +```bash +# Create a test payload +TEST_PAYLOAD=$(python3 -c " +import json, zlib, base64 +config = { + 'lang': 'py', + 'mode': 'test', + 'files': {'main.py': 'import torch; print(torch.cuda.get_device_name(0))'}, + 'tests': [], + 'benchmarks': [], + 'test_timeout': 60, + 'benchmark_timeout': 60, + 'ranked_timeout': 60, +} +print(base64.b64encode(zlib.compress(json.dumps(config).encode())).decode()) +") + +# Trigger build on specific queue +curl -X POST "https://api.buildkite.com/v2/organizations/gpu-mode/pipelines/kernelbot-runner/builds" \ + -H "Authorization: Bearer $BUILDKITE_API_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "commit": "HEAD", + "branch": "main", + "message": "Agent integration test", + "env": { + "GPU_QUEUE": "nvidia-h100-0", + "SUBMISSION_PAYLOAD": 
"'"$TEST_PAYLOAD"'" + } + }' | jq '{number, web_url, state}' + +# Watch the build +# Check Buildkite dashboard or poll API +``` + +### 3.5 Concurrent Job Test + +Verify jobs don't interfere with each other: + +```bash +# Trigger jobs on different GPUs simultaneously +for i in 0 1 2 3; do + curl -X POST "https://api.buildkite.com/v2/organizations/gpu-mode/pipelines/kernelbot-runner/builds" \ + -H "Authorization: Bearer $BUILDKITE_API_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "commit": "HEAD", + "branch": "main", + "message": "Concurrent test GPU '"$i"'", + "env": { + "GPU_QUEUE": "nvidia-h100-'"$i"'", + "SUBMISSION_PAYLOAD": "'"$TEST_PAYLOAD"'" + } + }' & +done +wait + +# All 4 should run in parallel on different agents +``` + +--- + +## 4. End-to-End Tests (Full System) + +### 4.1 API Submission Test + +Test the full flow through Kernelbot's API: + +```bash +# This requires the full Kernelbot stack running +# Submit via API endpoint +curl -X POST "http://localhost:8000/leaderboard/test-leaderboard/nvidia-h100/test" \ + -H "Content-Type: application/json" \ + -d '{ + "code": "import torch; print(torch.cuda.get_device_name(0))", + "user_id": "test-user", + "user_name": "Test User" + }' +``` + +### 4.2 Discord Bot Test + +Test submission via Discord (manual): + +1. Go to the Discord server with Kernelbot +2. Use `/leaderboard submit test` command +3. Select a Buildkite GPU type (e.g., `nvidia-h100`) +4. Upload a test script +5. Verify the submission runs and returns results + +### 4.3 Benchmark Accuracy Test + +Compare results between launchers: + +```bash +# Run same benchmark on Modal and Buildkite +# Results should be within acceptable variance (< 5% for microbenchmarks) + +# This requires a benchmark that runs on both +# Compare the timing results in the database +``` + +--- + +## 5. 
Troubleshooting Tests + +### 5.1 Timeout Behavior + +Test that timeouts work correctly: + +```bash +# Create a payload that times out +TIMEOUT_PAYLOAD=$(python3 -c " +import json, zlib, base64 +config = { + 'lang': 'py', + 'mode': 'test', + 'files': {'main.py': 'import time; time.sleep(300)'}, # 5 minutes + 'tests': [], + 'benchmarks': [], + 'test_timeout': 10, # 10 second timeout + 'benchmark_timeout': 10, + 'ranked_timeout': 10, +} +print(base64.b64encode(zlib.compress(json.dumps(config).encode())).decode()) +") + +# Should timeout after ~10 seconds +docker run --rm --gpus '"device=0"' \ + -e SUBMISSION_PAYLOAD="$TIMEOUT_PAYLOAD" \ + ghcr.io/gpu-mode/kernelbot-runner:latest +``` + +### 5.2 Error Handling + +Test error cases: + +```bash +# Missing GPU +docker run --rm \ + -e SUBMISSION_PAYLOAD="$TEST_PAYLOAD" \ + ghcr.io/gpu-mode/kernelbot-runner:latest +# Should fail gracefully with error in result.json + +# Invalid payload +docker run --rm --gpus '"device=0"' \ + -e SUBMISSION_PAYLOAD="not-valid-base64" \ + ghcr.io/gpu-mode/kernelbot-runner:latest +# Should fail with clear error message + +# Missing payload +docker run --rm --gpus '"device=0"' \ + ghcr.io/gpu-mode/kernelbot-runner:latest +# Should fail with "SUBMISSION_PAYLOAD not set" error +``` + +### 5.3 Agent Recovery + +Test agent recovers from failures: + +```bash +# Kill the agent process +sudo systemctl kill -s SIGKILL buildkite-agent-gpu0 + +# Check it restarts automatically +sleep 5 +sudo systemctl status buildkite-agent-gpu0 +# Should show "active (running)" +``` + +--- + +## Test Checklist + +Use this checklist before going live: + +### Vendor Checklist + +- [ ] Image pulls successfully +- [ ] Image runs with GPU access +- [ ] Single GPU isolation works +- [ ] Runner script executes test payload +- [ ] CUDA operations work in container +- [ ] All agents show as connected in Buildkite +- [ ] Cgroup limits are enforced +- [ ] Test build completes successfully +- [ ] Artifacts are uploaded correctly +- [ ] Agent restarts after failure + +### Developer Checklist + +- [ ] Unit tests pass +- [ ] BuildkiteLauncher creates builds +- [ ] Polling works correctly +- [ ] Artifacts are downloaded and parsed +- [ ] Timeouts are handled +- [ ] Errors return proper FullResult +- [ ] GPU enum is registered correctly +- [ ] Launcher is registered in main.py diff --git a/scripts/buildkite/setup-agent.sh b/scripts/buildkite/setup-agent.sh new file mode 100755 index 00000000..9f6e4821 --- /dev/null +++ b/scripts/buildkite/setup-agent.sh @@ -0,0 +1,157 @@ +#!/bin/bash +# +# Buildkite Agent Setup Script for Kernelbot +# +# This script configures a Buildkite agent for a single GPU with proper isolation. +# Each GPU on the node should have its own agent with dedicated resources. +# +# Usage: +# sudo ./setup-agent.sh [memory_limit] [cpu_cores] +# +# Examples: +# sudo ./setup-agent.sh 0 nvidia-h100-0 32G 16 +# sudo ./setup-agent.sh 1 nvidia-h100-1 32G 16 +# +# Prerequisites: +# - Buildkite agent installed: https://buildkite.com/docs/agent/v3/installation +# - Docker installed with NVIDIA runtime +# - BUILDKITE_AGENT_TOKEN set in environment or passed via config +# +# What this script does: +# 1. Creates a systemd service for the agent bound to specific GPU +# 2. Creates a cgroup slice for CPU/RAM isolation +# 3. 
Configures agent with queue tags for job routing +# + +set -euo pipefail + +# Parse arguments +GPU_INDEX="${1:?GPU index required (e.g., 0, 1, 2...)}" +QUEUE_NAME="${2:?Queue name required (e.g., nvidia-h100-0)}" +MEMORY_LIMIT="${3:-32G}" +CPU_CORES="${4:-16}" + +# Validate GPU exists +if ! nvidia-smi -i "$GPU_INDEX" &>/dev/null; then + echo "Error: GPU $GPU_INDEX not found" + nvidia-smi -L + exit 1 +fi + +echo "Setting up Buildkite agent for GPU $GPU_INDEX with queue $QUEUE_NAME" +echo " Memory limit: $MEMORY_LIMIT" +echo " CPU cores: $CPU_CORES" + +# Create buildkite user if it doesn't exist +if ! id buildkite &>/dev/null; then + echo "Creating buildkite user..." + useradd -r -m -s /bin/bash buildkite + usermod -aG docker buildkite +fi + +# Create agent config directory +AGENT_CONFIG_DIR="/etc/buildkite-agent/agent-${GPU_INDEX}" +mkdir -p "$AGENT_CONFIG_DIR" + +# Create agent configuration file +cat > "$AGENT_CONFIG_DIR/buildkite-agent.cfg" << EOF +# Buildkite Agent Configuration for GPU $GPU_INDEX +# Auto-generated by setup-agent.sh + +name="gpu-${GPU_INDEX}-%hostname-%n" +tags="queue=${QUEUE_NAME}" +build-path="/var/lib/buildkite-agent/builds-gpu${GPU_INDEX}" + +# Hooks directory (optional) +hooks-path="/etc/buildkite-agent/hooks" + +# Plugins directory +plugins-path="/var/lib/buildkite-agent/plugins" + +# Disconnect after job (for clean state) +disconnect-after-job=false +disconnect-after-idle-timeout=0 + +# Enable job log timestamps +timestamp-lines=true +EOF + +# Create build directory +mkdir -p "/var/lib/buildkite-agent/builds-gpu${GPU_INDEX}" +chown -R buildkite:buildkite "/var/lib/buildkite-agent/builds-gpu${GPU_INDEX}" + +# Create cgroup slice for resource isolation +cat > "/etc/systemd/system/buildkite-gpu${GPU_INDEX}.slice" << EOF +[Unit] +Description=Buildkite Agent Slice for GPU ${GPU_INDEX} +Before=slices.target + +[Slice] +MemoryMax=${MEMORY_LIMIT} +CPUQuota=$((CPU_CORES * 100))% +EOF + +# Create systemd service for this GPU +cat > "/etc/systemd/system/buildkite-agent-gpu${GPU_INDEX}.service" << EOF +[Unit] +Description=Buildkite Agent for GPU ${GPU_INDEX} (${QUEUE_NAME}) +Documentation=https://buildkite.com/docs/agent/v3 +After=network.target docker.service +Requires=docker.service + +[Service] +Type=simple +User=buildkite +Group=buildkite + +# GPU isolation - only this GPU is visible +Environment="CUDA_VISIBLE_DEVICES=${GPU_INDEX}" + +# Pass agent token (should be set in environment file) +EnvironmentFile=-/etc/buildkite-agent/token + +# Use agent-specific config +ExecStart=/usr/bin/buildkite-agent start --config ${AGENT_CONFIG_DIR}/buildkite-agent.cfg + +# Restart on failure +Restart=always +RestartSec=5 + +# Resource isolation via cgroup slice +Slice=buildkite-gpu${GPU_INDEX}.slice + +# Hardening +NoNewPrivileges=false +ProtectSystem=full +ProtectHome=read-only + +[Install] +WantedBy=multi-user.target +EOF + +# Create environment file for token if it doesn't exist +if [[ ! -f /etc/buildkite-agent/token ]]; then + cat > /etc/buildkite-agent/token << EOF +# Buildkite agent token - set this to your organization's token +# Get it from: https://buildkite.com/organizations//agents +BUILDKITE_AGENT_TOKEN= +EOF + chmod 600 /etc/buildkite-agent/token + echo "" + echo "⚠️ IMPORTANT: Set your Buildkite agent token in /etc/buildkite-agent/token" +fi + +# Reload systemd and enable the service +systemctl daemon-reload +systemctl enable "buildkite-agent-gpu${GPU_INDEX}.service" + +echo "" +echo "✅ Buildkite agent for GPU $GPU_INDEX configured successfully!" 
+echo "" +echo "Next steps:" +echo " 1. Set BUILDKITE_AGENT_TOKEN in /etc/buildkite-agent/token" +echo " 2. Start the agent: sudo systemctl start buildkite-agent-gpu${GPU_INDEX}" +echo " 3. Check status: sudo systemctl status buildkite-agent-gpu${GPU_INDEX}" +echo " 4. View logs: sudo journalctl -u buildkite-agent-gpu${GPU_INDEX} -f" +echo "" +echo "To set up additional GPUs, run this script with different GPU indices." diff --git a/scripts/buildkite/update-image.sh b/scripts/buildkite/update-image.sh new file mode 100755 index 00000000..402a91e1 --- /dev/null +++ b/scripts/buildkite/update-image.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# +# Update Kernelbot Runner Image +# +# This script pulls the latest runner image and restarts agents to use it. +# Run this when notified of a new image release. +# +# Usage: +# sudo ./update-image.sh +# + +set -euo pipefail + +IMAGE="ghcr.io/gpu-mode/kernelbot-runner:latest" + +echo "Pulling latest kernelbot runner image..." +docker pull "$IMAGE" + +echo "" +echo "Image updated. Checking for running agents..." + +# Find all buildkite-agent-gpu* services +AGENTS=$(systemctl list-units --type=service --state=running --no-legend | grep 'buildkite-agent-gpu' | awk '{print $1}' || true) + +if [[ -z "$AGENTS" ]]; then + echo "No running Buildkite GPU agents found." + echo "Image will be used on next job run." +else + echo "Found running agents:" + echo "$AGENTS" + echo "" + read -p "Restart agents to use new image? (y/N) " -n 1 -r + echo "" + if [[ $REPLY =~ ^[Yy]$ ]]; then + for agent in $AGENTS; do + echo "Restarting $agent..." + systemctl restart "$agent" + done + echo "" + echo "✅ All agents restarted with new image." + else + echo "Agents will use new image on next job run." + fi +fi + +echo "" +echo "Current image info:" +docker inspect "$IMAGE" --format='ID: {{.Id}}' +docker inspect "$IMAGE" --format='Created: {{.Created}}' +docker inspect "$IMAGE" --format='Labels: {{json .Config.Labels}}' diff --git a/src/discord_cluster_manager.egg-info/PKG-INFO b/src/discord_cluster_manager.egg-info/PKG-INFO new file mode 100644 index 00000000..1e9fe5a0 --- /dev/null +++ b/src/discord_cluster_manager.egg-info/PKG-INFO @@ -0,0 +1,407 @@ +Metadata-Version: 2.4 +Name: discord-cluster-manager +Version: 0.1.0 +Summary: Discord bot for managing compute clusters and running kernel benchmarks +Requires-Python: >=3.10 +Description-Content-Type: text/markdown +License-File: LICENSE +Requires-Dist: PyGithub +Requires-Dist: aiohttp +Requires-Dist: discord.py +Requires-Dist: audioop-lts; python_version >= "3.13" +Requires-Dist: python-dotenv +Requires-Dist: requests +Requires-Dist: modal +Requires-Dist: psycopg2-binary +Requires-Dist: yoyo-migrations +Requires-Dist: better_profanity +Requires-Dist: PyYAML +Requires-Dist: fastapi[all] +Requires-Dist: uvicorn +Requires-Dist: jinja2 +Provides-Extra: dev +Requires-Dist: ruff; extra == "dev" +Requires-Dist: pre-commit; extra == "dev" +Requires-Dist: pytest; extra == "dev" +Requires-Dist: pytest-coverage; extra == "dev" +Requires-Dist: pytest-asyncio; extra == "dev" +Dynamic: license-file + +# KernelBot + +[![nvidia-on-prem](https://github.com/gpu-mode/discord-cluster-manager/actions/workflows/nvidia-on-prem-health.yml/badge.svg)](https://github.com/gpu-mode/discord-cluster-manager/actions/workflows/nvidia-on-prem-health.yml) + +This is the code for the Discord bot we'll be using to queue jobs to a cluster of GPUs that our generous sponsors have provided. 
Our goal is to be able to queue kernels that run end to end in seconds, so that things feel interactive and social.
+
+The key idea is that we're using GitHub Actions as a job-scheduling engine, with the Discord bot interacting with the cluster primarily by issuing GitHub Actions workflows and monitoring their status. While we're focused on having a nice user experience on discord.gg/gpumode, [we're happy to accept PRs](#local-development) that make it easier for other Discord communities to hook up GPUs.
+
+## Table of Contents
+
+- [Supported Schedulers](#supported-schedulers)
+- [Local Development](#local-development)
+  - [Clone Repository](#clone-repository)
+  - [Setup Discord Bot](#setup-discord-bot)
+  - [Database](#database)
+  - [Environment Variables](#environment-variables)
+  - [Verify Setup](#verify-setup)
+- [Available Commands](#available-commands)
+- [Using the Leaderboard](#using-the-leaderboard)
+  - [Creating a New Leaderboard](#creating-a-new-leaderboard)
+  - [Reference Code Requirements (Python)](#reference-code-requirements-python)
+  - [Reference Code Requirements (CUDA)](#reference-code-requirements-cuda)
+  - [Submitting to a Leaderboard](#submitting-to-a-leaderboard)
+  - [Other Available Leaderboard Commands](#other-available-leaderboard-commands)
+  - [GPU Kernel-Specific Commands](#gpu-kernel-specific-commands)
+- [Testing the Discord Bot](#testing-the-discord-bot)
+- [How to Add a New GPU to the Cluster](#how-to-add-a-new-gpu-to-the-cluster)
+- [Acknowledgements](#acknowledgements)
+
+## Supported Schedulers
+
+- GitHub Actions
+- Modal
+- Slurm (not implemented yet)
+
+## Local Development
+
+### Clone Repository
+
+> [!IMPORTANT]
+> Do not fork this repository. Instead, directly clone this repository to your local machine.
+
+> [!IMPORTANT]
+> Python 3.11 or higher is required.
+
+Afterwards, install the dependencies with `pip install -r requirements-dev.txt`.
+
+### Setup Discord Bot
+
+To run and develop the bot locally, you need to add it to your own "staging" server. Follow the steps [here](https://discordjs.guide/preparations/setting-up-a-bot-application.html#creating-your-bot) and [here](https://discordjs.guide/preparations/adding-your-bot-to-servers.html#bot-invite-links) to create a bot application and then add it to your staging server.
+
+Below is a visual walk-through of the steps linked above:
+
+- The bot needs the `Message Content Intent` and `Server Members Intent` permissions turned on.
+<details>
+  <summary>Click here for visual.</summary>
+  (screenshot: DCS_bot_perms)
+</details>
+
+- The bot needs `applications.commands` and `bot` scopes.
+
+<details>
+  <summary>Click here for visual.</summary>
+  (screenshot: Screenshot 2024-11-24 at 12 34 09 PM)
+</details>
+
+- Finally, generate an invite link for the bot and enter it into any browser.
+
+<details>
+  <summary>Click here for visual.</summary>
+  (screenshot: Screenshot 2024-11-24 at 12 44 08 PM)
+</details>
+ +> [!NOTE] +> Bot permissions involving threads/mentions/messages should suffice, but you can naively give it `Administrator` since it's just a test bot in your own testing Discord server. + +### Database + +The leaderboard persists information in a Postgres database. To develop locally, set Postgres up on your machine. Then start a Postgres shell with `psql`, and create a database: + +``` +$ psql -U postgres +Password for user postgres: ******** +psql (16.6 (Ubuntu 16.6-1.pgdg22.04+1)) +Type "help" for help. + +postgres=# CREATE DATABASE clusterdev; +``` + +We are using [Yoyo Migrations](https://ollycope.com/software/yoyo/) to manage tables, indexes, etc. in our database. To create tables in your local database, apply the migrations in `src/discord-cluster-manager/migrations` with the following command line: + +``` +yoyo apply src/migrations \ + -d postgresql://user:password@localhost/clusterdev +``` + +
+<details>
+  <summary>Click here for a transcript of a yoyo apply session</summary>
+
+    $ yoyo apply . -d postgresql://user:password@localhost/clusterdev
+
+    [20241208_01_p3yuR-initial-leaderboard-schema]
+    Shall I apply this migration? [Ynvdaqjk?]: y
+
+    Selected 1 migration:
+    [20241208_01_p3yuR-initial-leaderboard-schema]
+    Apply this migration to postgresql://user:password@localhost/clusterdev [Yn]: y
+    Save migration configuration to yoyo.ini?
+    This is saved in plain text and contains your database password.
+
+    Answering 'y' means you do not have to specify the migration source or database connection for future runs [yn]: n
+
+</details>
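+
+For reference, the migration files that yoyo applies are plain Python. A minimal sketch of one (the filename and `example_note` column are hypothetical; `step` takes the apply and rollback SQL):
+
+```python
+# src/migrations/20250501_01_xxxxx-add-example-note.py (hypothetical)
+from yoyo import step
+
+__depends__ = {"20250412_02_NN9kK-user-info-cli-drop-old"}
+
+steps = [
+    step(
+        # apply: adding a nullable column
+        "ALTER TABLE submission ADD COLUMN example_note TEXT",
+        # rollback: the matching inverse change
+        "ALTER TABLE submission DROP COLUMN example_note",
+    )
+]
+```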
+ +Applying migrations to our staging and prod environments also happens using `yoyo apply`, just with a different database URL. + +To make changes to the structure of the database, create a new migration: + +``` +yoyo new src/discord-cluster-manager/migrations -m "short_description" +``` + +...and then edit the generated file. Please do not edit existing migration files: the existing migration files form a sort of changelog that is supposed to be immutable, and so yoyo will refuse to reapply the changes. + +We are following an expand/migrate/contract pattern to allow database migrations without downtime. When you want to make a change to the structure of the database, first determine if it is expansive or contractive. + +- _Expansive changes_ are those that have no possibility of breaking a running application. Examples include: adding a new nullable column, adding a non-null column with a default value, adding an index, adding a table, etc. +- _Contractive changes_ are those that could break a running application. Examples include: dropping a table, dropping a column, adding a not null constraint to a column, adding a unique index, etc. + +After an expansive phase, data gets migrated to the newly added elements. Code also begins using the newly added elements. This is the migration step. Finally, when all code is no longer using elements that are obsolete, these can be removed. (Or, if adding a unique or not null constraint, after checking that the data satisfies the constraint, then the constraint can be safely added.) + +Expand, migrate, and contract steps may all be written using yoyo. + +### Environment Variables + +Create a `.env` file with the following environment variables: + +- `DISCORD_DEBUG_TOKEN` : The token of the bot you want to run locally +- `DISCORD_TOKEN` : The token of the bot you want to run in production +- `DISCORD_DEBUG_CLUSTER_STAGING_ID` : The ID of the "staging" server you want to connect to +- `DISCORD_CLUSTER_STAGING_ID` : The ID of the "production" server you want to connect to +- `GITHUB_TOKEN` : A Github token with permissions to trigger workflows, for now only new branches from [discord-cluster-manager](https://github.com/gpu-mode/discord-cluster-manager) are tested, since the bot triggers workflows on your behalf +- `GITHUB_REPO` : The repository where the cluster manager is hosted. +- `GITHUB_WORKFLOW_BRANCH` : The branch to start the GitHub Actions jobs from when submitting a task. +- `DATABASE_URL` : The URL you use to connect to Postgres. +- `DISABLE_SSL` : (Optional) set if you want to disable SSL when connecting to Postgres. + +Below is where to find these environment variables: + +> [!NOTE] +> For now, you can naively set `DISCORD_DEBUG_TOKEN` and `DISCORD_DEBUG_CLUSTER_STAGING_ID` to the same values as `DISCORD_TOKEN` and `DISCORD_CLUSTER_STAGING_ID` respectively. + +- `DISCORD_DEBUG_TOKEN` or `DISCORD_TOKEN`: Found in your bot's page within the [Discord Developer Portal](https://discord.com/developers/applications/): + +
+<details>
+  <summary>Click here for visual.</summary>
+  (screenshot: Screenshot 2024-11-24 at 11 01 19 AM)
+</details>
+
+- `DISCORD_DEBUG_CLUSTER_STAGING_ID` or `DISCORD_CLUSTER_STAGING_ID`: Right-click your staging Discord server and select `Copy Server ID`:
+
+<details>
+  <summary>Click here for visual.</summary>
+  (screenshot: Screenshot 2024-11-24 at 10 58 27 AM)
+</details>
+
+- `GITHUB_TOKEN`: Found in Settings -> Developer Settings (or [here](https://github.com/settings/tokens?type=beta)). Create a new (preferably classic) personal access token with an expiration date less than a year from the current date, and the scopes `repo` and `workflow`.
+
+<details>
+  <summary>Click here for visual.</summary>
+  (screenshot: Screenshot 2024-12-30 at 8 51 59 AM)
+</details>
+ +- `GITHUB_REPO`: This should be set to this repository, which is usually `gpu-mode/discord-cluster-manager`. + +- `GITHUB_WORKFLOW_BRANCH`: Usually `main` or the branch you are working from. + +- `DATABASE_URL`: This contains the connection details for your local database, and has the form `postgresql://user:password@localhost/clusterdev`. + +- `DISABLE_SSL`: Set to `1` when developing. + +### Verify Setup + +Install the kernel bot as editable using `pip install -e .` + +Run the following command to run the bot: + +``` +python src/kernelbot/main.py --debug +``` + +Then in your staging server, use the `/verifyruns` command to test basic functionalities of the bot and the `/verifydb` command to check database connectivity. + +> [!NOTE] +> To test functionality of the Modal runner, you also need to be authenticated with Modal. Modal provides free credits to get started. +> To test functionality of the GitHub runner, you may need direct access to this repo which you can ping us for. + +## Available Commands + +TODO. This is currently a work in progress. + +`/run modal ` which you can use to pick a specific gpu, right now defaults to T4 + +`/run github ` which picks one of two workflow files + +`/resync` to clear all the commands and resync them + +`/ping` to check if the bot is online + +## Using the Leaderboard + +The main purpose of the Discord bot is to allow servers to host coding competitions through Discord. +The leaderboard was designed for evaluating GPU kernels, but can be adapted easily for other +competitions. The rest of this section will mostly refer to leaderboard submissions in the context +of our GPU Kernel competition. + +> [!NOTE] +> All leaderboard commands have the prefix `/leaderboard`, and center around creating, submitting to, +> and viewing leaderboard statistics and information. + +### Creating a new Leaderboard + +``` +/leaderboard create {name: str} {deadline: str} {reference_code: .cu or .py file} +``` + +The above command creates a leaderboard named `name` that ends at `deadline`. The `reference_code` +has strict function signature requirements, and is required to contain an input generator and a +reference implementation for the desired GPU kernel. We import these functions in our evaluation +scripts for verifying leaderboard submissions and measuring runtime. In the next mini-section, we +discuss the exact requirements for the `reference_code` script. + +Each leaderboard `name` can also specify the types of hardware that users can run their kernels on. +For example, a softmax kernel on an RTX 4090 can have different performance characteristics on an +H100. After running the leaderboard creation command, a prompt will pop up where the creator can +specify the available GPUs that the leaderboard evaluates on. + +![Leaderboard GPU](assets/img/lb_gpu.png) + +#### Reference Code Requirements (Python) + +The Discord bot internally contains an `eval.py` script that handles the correctness and timing +analysis for the leaderboard. The `reference_code` that the leaderboard creator submits must have +the following function signatures with their implementations filled out. `InputType` and +`OutputType` are generics that could be a `torch.Tensor`, `List[torch.Tensor]`, etc. +depending on the reference code specifications. We leave this flexibility to the leaderboard creator. + +```python +# Reference kernel implementation. +def ref_kernel(input: InputType) -> OutputType: + # Implement me... 
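+    # (Illustrative only: for a softmax leaderboard, for example, this could be
+    #  `return torch.softmax(input, dim=-1)`.)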
+
+# Generate a list of tensors as input to the kernel
+def generate_input() -> InputType:
+    # Implement me...
+
+# Verify correctness of reference and output
+def check_implementation(custom_out: OutputType, reference_out: OutputType) -> bool:
+    # Implement me...
+```
+
+#### Reference Code Requirements (CUDA)
+
+The Discord bot internally contains an `eval.cu` script that handles the correctness and timing
+analysis for the leaderboard. The difficulty of CUDA evaluation scripts is that we need to explicitly
+handle the typing system for tensors. The `reference.cu` that the leaderboard creator submits must have
+the following function signatures with their implementations filled out.
+
+The main difference is that we now need to define an alias for the input / output types. A
+simple and common example is a list of FP32 tensors, which can be defined using a pre-defined
+`const int` called `N_SIZES` and an array of containers, e.g.
+`std::array<std::vector<float>, N_SIZES>`.
+
+```cuda
+// User-defined type for inputs, e.g. using input_t = std::array<std::vector<float>, IN_SIZES>;
+using input_t = ...;
+
+// User-defined type for outputs, e.g. using output_t = std::array<std::vector<float>, OUT_SIZES>;
+using output_t = ...;
+
+// Generate random data of type input_t
+input_t generate_input() {
+    // Implement me...
+}
+
+
+// Reference kernel host code.
+output_t reference(input_t data) {
+    // Implement me...
+}
+
+
+// Verify correctness of reference and output
+bool check_implementation(output_t out, output_t ref) {
+    // Implement me...
+}
+```
+
+### Submitting to a Leaderboard
+
+```
+/leaderboard submit {github / modal} {leaderboard_name: str} {script: .cu or .py file}
+```
+
+The leaderboard submission for _Python code_ requires the following function signatures, where
+`InputType` and `OutputType` are generics that could be a `torch.Tensor`, `List[torch.Tensor]`,
+etc. depending on the reference code specifications.
+
+```python
+# User kernel implementation.
+def custom_kernel(input: InputType) -> OutputType:
+    # Implement me...
+```
+
+A complete, illustrative reference / submission pair is sketched at the end of this section.
+
+### Other Available Leaderboard Commands
+
+Deleting a leaderboard:
+
+```
+/leaderboard delete {name: str}
+```
+
+List all active leaderboards and which GPUs they can run on:
+
+```
+/leaderboard list
+```
+
+List all leaderboard scores (runtime) for a particular leaderboard (currently deprecated; it does
+not support multiple GPU types yet):
+
+```
+/leaderboard show {name: str}
+```
+
+Display all personal scores (runtime) from a specific leaderboard:
+
+```
+/leaderboard show-personal {name: str}
+```
+
+### Submitting via a CLI
+
+Moving forward, we also allow submissions without logging in to Discord via [popcorn-cli](https://github.com/gpu-mode/popcorn-cli), a CLI tool we wrote in Rust.
+
+#### GPU Kernel-specific Commands
+
+We plan to add support for the PyTorch profiler and the NVIDIA Nsight Compute CLI to allow users to
+profile their kernels. These commands are not specific to the leaderboard, but may be helpful for
+leaderboard submissions.
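+
+To make the requirements above concrete, below is an illustrative sketch of a matching
+reference / submission pair for a simple elementwise-square kernel. The operation, the sizes,
+and the concrete `InputType` / `OutputType` here are invented for illustration; every real
+leaderboard defines its own in its reference code.
+
+```python
+from typing import List
+
+import torch
+
+# For this example only: the leaderboard's reference code fixes these types.
+InputType = List[torch.Tensor]
+OutputType = List[torch.Tensor]
+
+
+# --- Reference code (written by the leaderboard creator) ---
+def generate_input() -> InputType:
+    # Four small random tensors as kernel input.
+    return [torch.randn(1024) for _ in range(4)]
+
+
+def ref_kernel(input: InputType) -> OutputType:
+    # Trivially correct (but not necessarily fast) reference implementation.
+    return [x * x for x in input]
+
+
+def check_implementation(custom_out: OutputType, reference_out: OutputType) -> bool:
+    # Elementwise comparison with a floating-point tolerance.
+    return all(torch.allclose(c, r) for c, r in zip(custom_out, reference_out))
+
+
+# --- Submission (written by the competitor) ---
+def custom_kernel(input: InputType) -> OutputType:
+    return [torch.square(x) for x in input]
+```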
+
+## How to Add a New GPU to the Cluster
+
+If you'd like to donate a GPU to our efforts, we can make you a CI admin on GitHub and have you add an org-level runner: https://github.com/organizations/gpu-mode/settings/actions/runners
+
+## Acknowledgements
+
+- Thank you to AMD for sponsoring an MI250 node
+- Thank you to NVIDIA for sponsoring an H100 node
+- Thank you to Nebius for sponsoring credits and an H100 node
+- Thank you to Modal for credits and speedy startup times
+- Luca Antiga did something very similar for the NeurIPS LLM efficiency competition; it was great!
+- Midjourney was a similar inspiration in terms of UX
+
+## Citation
+If you use our software, please cite it as:
+
+```
+@inproceedings{
+  kernelbot2025,
+  title={KernelBot: A Competition Platform for Writing Heterogeneous {GPU} Code},
+  author={Alex L Zhang and Matej Sirovatka and Erik Schultheis and Benjamin Horowitz and Mark Saroufim},
+  note={Equal Contribution},
+  booktitle={Championing Open-source Development in ML Workshop @ ICML25},
+  year={2025},
+  url={https://openreview.net/forum?id=bq9U4dmuyJ}
+}
+```
diff --git a/src/discord_cluster_manager.egg-info/SOURCES.txt b/src/discord_cluster_manager.egg-info/SOURCES.txt
new file mode 100644
index 00000000..06f31a50
--- /dev/null
+++ b/src/discord_cluster_manager.egg-info/SOURCES.txt
@@ -0,0 +1,71 @@
+LICENSE
+README.md
+pyproject.toml
+src/discord_cluster_manager.egg-info/PKG-INFO
+src/discord_cluster_manager.egg-info/SOURCES.txt
+src/discord_cluster_manager.egg-info/dependency_links.txt
+src/discord_cluster_manager.egg-info/requires.txt
+src/discord_cluster_manager.egg-info/top_level.txt
+src/kernelbot/discord_reporter.py
+src/kernelbot/discord_utils.py
+src/kernelbot/env.py
+src/kernelbot/main.py
+src/kernelbot/api/__init__.py
+src/kernelbot/api/api_utils.py
+src/kernelbot/api/main.py
+src/kernelbot/cogs/__init__.py
+src/kernelbot/cogs/admin_cog.py
+src/kernelbot/cogs/leaderboard_cog.py
+src/kernelbot/cogs/misc_cog.py
+src/kernelbot/cogs/verify_run_cog.py
+src/kernelbot/ui/misc.py
+src/kernelbot/ui/table.py
+src/libkernelbot/__init__.py
+src/libkernelbot/backend.py
+src/libkernelbot/background_submission_manager.py
+src/libkernelbot/consts.py
+src/libkernelbot/db_types.py
+src/libkernelbot/leaderboard_db.py
+src/libkernelbot/report.py
+src/libkernelbot/run_eval.py
+src/libkernelbot/submission.py
+src/libkernelbot/task.py
+src/libkernelbot/utils.py
+src/libkernelbot/launchers/__init__.py
+src/libkernelbot/launchers/github.py
+src/libkernelbot/launchers/launcher.py
+src/libkernelbot/launchers/modal.py
+src/migrations/20241208_01_p3yuR-initial-leaderboard-schema.py
+src/migrations/20241214_01_M62BX-drop-old-leaderboard-tables.py
+src/migrations/20241221_01_54Oeg-rename-problem-table.py
+src/migrations/20241222_01_ELxU5-add-gpu-types.py
+src/migrations/20241224_01_Pg4FX-delete-cascade.py
+src/migrations/20241226_01_ZQSOK-add_gpu_type_to_submission.py
+src/migrations/20250106_01_Sgph3-add-leaderboard-creator-id.py
+src/migrations/20250202_01_YYS3Q-leaderboard-rename-reference-to-task.py
+src/migrations/20250221_01_GA8ro-submission-collection.py
+src/migrations/20250228_01_9ANYn-submission-add-user-name.py
+src/migrations/20250304_01_DzORz-collect-system-information-for-each-run.py
+src/migrations/20250316_01_5oMi3-remember-forum-id.py
+src/migrations/20250329_01_7VjJJ-add-a-secret-seed-column.py
+src/migrations/20250406_01_ZXjWK-user-info-add-cli-id.py
+src/migrations/20250412_01_l7Dra-user-info-fix-auth.py
+src/migrations/20250412_02_NN9kK-user-info-cli-drop-old.py
+src/migrations/20250506_01_38PkG-add-index-on-runs-runner-score.py +src/migrations/20250617_01_c5mrF-task-split.py +src/migrations/20250728_01_Q3jso-fix-code-table.py +src/migrations/20250822_01_UtXzl-website-submission.py +src/migrations/20251106_01_kOjGy-draft-code-editor.py +src/migrations/20260108_01_gzSm3-add-submission-status.py +src/runners/github-runner.py +src/runners/modal_runner.py +src/runners/modal_runner_archs.py +tests/test_backend.py +tests/test_background_submission_manager.py +tests/test_github.py +tests/test_leaderboard_db.py +tests/test_modal.py +tests/test_report.py +tests/test_submission.py +tests/test_task.py +tests/test_utils.py \ No newline at end of file diff --git a/src/discord_cluster_manager.egg-info/dependency_links.txt b/src/discord_cluster_manager.egg-info/dependency_links.txt new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/discord_cluster_manager.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/src/discord_cluster_manager.egg-info/requires.txt b/src/discord_cluster_manager.egg-info/requires.txt new file mode 100644 index 00000000..eb2fb39a --- /dev/null +++ b/src/discord_cluster_manager.egg-info/requires.txt @@ -0,0 +1,23 @@ +PyGithub +aiohttp +discord.py +python-dotenv +requests +modal +psycopg2-binary +yoyo-migrations +better_profanity +PyYAML +fastapi[all] +uvicorn +jinja2 + +[:python_version >= "3.13"] +audioop-lts + +[dev] +ruff +pre-commit +pytest +pytest-coverage +pytest-asyncio diff --git a/src/discord_cluster_manager.egg-info/top_level.txt b/src/discord_cluster_manager.egg-info/top_level.txt new file mode 100644 index 00000000..e90efa46 --- /dev/null +++ b/src/discord_cluster_manager.egg-info/top_level.txt @@ -0,0 +1,4 @@ +kernelbot +libkernelbot +migrations +runners diff --git a/src/kernelbot/env.py b/src/kernelbot/env.py index b1758b63..4e0e3bbc 100644 --- a/src/kernelbot/env.py +++ b/src/kernelbot/env.py @@ -47,3 +47,8 @@ def init_environment(): # PostgreSQL-specific constants env.DATABASE_URL = os.getenv("DATABASE_URL") env.DISABLE_SSL = os.getenv("DISABLE_SSL") + +# Buildkite-specific constants (optional - for vendor-managed GPU runners) +env.BUILDKITE_API_TOKEN = os.getenv("BUILDKITE_API_TOKEN") +env.BUILDKITE_ORG = os.getenv("BUILDKITE_ORG", "gpu-mode") +env.BUILDKITE_PIPELINE = os.getenv("BUILDKITE_PIPELINE", "kernelbot-runner") diff --git a/src/kernelbot/main.py b/src/kernelbot/main.py index e0411096..2aff46d2 100644 --- a/src/kernelbot/main.py +++ b/src/kernelbot/main.py @@ -16,7 +16,7 @@ from libkernelbot import consts from libkernelbot.backend import KernelBackend from libkernelbot.background_submission_manager import BackgroundSubmissionManager -from libkernelbot.launchers import GitHubLauncher, ModalLauncher +from libkernelbot.launchers import BuildkiteLauncher, GitHubLauncher, ModalLauncher from libkernelbot.utils import setup_logging logger = setup_logging(__name__) @@ -43,6 +43,15 @@ def __init__(self, debug_mode=False): self.backend.register_launcher( GitHubLauncher(env.GITHUB_REPO, env.GITHUB_TOKEN, env.GITHUB_WORKFLOW_BRANCH) ) + # Register Buildkite launcher if configured (optional - for vendor-managed GPU runners) + if env.BUILDKITE_API_TOKEN: + self.backend.register_launcher( + BuildkiteLauncher( + org=env.BUILDKITE_ORG, + pipeline=env.BUILDKITE_PIPELINE, + token=env.BUILDKITE_API_TOKEN, + ) + ) @property def leaderboard_db(self): diff --git a/src/libkernelbot/consts.py b/src/libkernelbot/consts.py index f60764de..e5beabee 100644 --- a/src/libkernelbot/consts.py +++ 
b/src/libkernelbot/consts.py
@@ -14,6 +14,7 @@ class SchedulerType(Enum):
     GITHUB = "github"
     MODAL = "modal"
     SLURM = "slurm"
+    BUILDKITE = "buildkite"
 
 
 class GitHubGPU(Enum):
@@ -23,6 +24,19 @@ class GitHubGPU(Enum):
     MI300x8 = "MI300x8"
 
 
+class BuildkiteGPU(Enum):
+    # Queue naming: {vendor}-{gpu_type}
+    # Buildkite agents use tags like queue=nvidia-h100-0 for per-GPU routing
+    # The enum value is the queue prefix; agents append -N for specific GPU index
+    NVIDIA_H100 = "nvidia-h100"
+    NVIDIA_B200 = "nvidia-b200"
+    NVIDIA_A100 = "nvidia-a100"
+    AMD_MI300 = "amd-mi300"
+    AMD_MI250 = "amd-mi250"
+    GOOGLE_TPU = "google-tpu"
+    NEBIUS_H100 = "nebius-h100"
+
+
 class ModalGPU(Enum):
     T4 = "T4"
     L4 = "L4"
@@ -50,7 +64,7 @@ def _make_gpu_lookup(runner_map: dict[str, Type[Enum]]):
     return lookup
 
 
-_GPU_LOOKUP = _make_gpu_lookup({"Modal": ModalGPU, "GitHub": GitHubGPU})
+_GPU_LOOKUP = _make_gpu_lookup({"Modal": ModalGPU, "GitHub": GitHubGPU, "Buildkite": BuildkiteGPU})
 
 
 def get_gpu_by_name(name: str) -> GPU:
diff --git a/src/libkernelbot/launchers/__init__.py b/src/libkernelbot/launchers/__init__.py
index df47476f..1a7a8a39 100644
--- a/src/libkernelbot/launchers/__init__.py
+++ b/src/libkernelbot/launchers/__init__.py
@@ -1,5 +1,6 @@
+from .buildkite import BuildkiteLauncher
 from .github import GitHubLauncher
 from .launcher import Launcher
 from .modal import ModalLauncher
 
-__all__ = [Launcher, GitHubLauncher, ModalLauncher]
+__all__ = ["Launcher", "GitHubLauncher", "ModalLauncher", "BuildkiteLauncher"]
diff --git a/src/libkernelbot/launchers/buildkite.py b/src/libkernelbot/launchers/buildkite.py
new file mode 100644
index 00000000..78aa7b09
--- /dev/null
+++ b/src/libkernelbot/launchers/buildkite.py
@@ -0,0 +1,300 @@
+import asyncio
+import base64
+import datetime
+import json
+import math
+import zlib
+from typing import Awaitable, Callable
+
+import requests
+
+from libkernelbot.consts import (
+    DEFAULT_GITHUB_TIMEOUT_MINUTES,
+    GPU,
+    TIMEOUT_BUFFER_MINUTES,
+    BuildkiteGPU,
+    SubmissionMode,
+)
+from libkernelbot.report import RunProgressReporter
+from libkernelbot.run_eval import (
+    CompileResult,
+    EvalResult,
+    FullResult,
+    ProfileResult,
+    RunResult,
+    SystemInfo,
+)
+from libkernelbot.utils import setup_logging
+
+from .launcher import Launcher
+
+logger = setup_logging()
+
+# Buildkite API base URL
+BUILDKITE_API_BASE = "https://api.buildkite.com/v2"
+
+
+def get_timeout(config: dict) -> int:
+    """Get timeout in minutes from config, matching GitHub launcher pattern."""
+    mode = config.get("mode")
+    sec_map = {
+        SubmissionMode.TEST.value: config.get("test_timeout"),
+        SubmissionMode.BENCHMARK.value: config.get("benchmark_timeout"),
+        SubmissionMode.LEADERBOARD.value: config.get("ranked_timeout"),
+    }
+    seconds = sec_map.get(mode) or DEFAULT_GITHUB_TIMEOUT_MINUTES * 60
+    return math.ceil(seconds / 60)
+
+
+class BuildkiteLauncher(Launcher):
+    """
+    Launcher for Buildkite-based GPU runners.
+
+    Buildkite agents are configured per-GPU with isolated resources:
+    - Each agent bound to single GPU via CUDA_VISIBLE_DEVICES
+    - CPU/RAM limits enforced via systemd cgroups
+    - Queue tags route jobs to specific GPU types (e.g., queue=nvidia-h100-0)
+    """
+
+    def __init__(self, org: str, pipeline: str, token: str):
+        """
+        Initialize Buildkite launcher.
+ + Args: + org: Buildkite organization slug (e.g., "gpu-mode") + pipeline: Pipeline slug (e.g., "kernelbot-runner") + token: Buildkite API token with build creation permissions + """ + super().__init__(name="Buildkite", gpus=BuildkiteGPU) + self.org = org + self.pipeline = pipeline + self.token = token + self._headers = { + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + } + + async def run_submission( + self, config: dict, gpu_type: GPU, status: RunProgressReporter + ) -> FullResult: + """ + Run a submission on a Buildkite agent. + + Args: + config: Submission configuration dict + gpu_type: GPU type to run on (determines queue routing) + status: Progress reporter for user feedback + + Returns: + FullResult with compilation and run results + """ + # Compress config (same as GitHub launcher) + payload = base64.b64encode(zlib.compress(json.dumps(config).encode("utf-8"))).decode( + "utf-8" + ) + + # Create build via Buildkite API + build_url = f"{BUILDKITE_API_BASE}/organizations/{self.org}/pipelines/{self.pipeline}/builds" + + # Queue name from GPU type value (e.g., "nvidia-h100") + # Buildkite will route to any agent with matching queue tag + queue_name = gpu_type.value + + build_data = { + "commit": "HEAD", + "branch": "main", + "message": f"Kernel submission on {gpu_type.name}", + "env": { + "SUBMISSION_PAYLOAD": payload, + "GPU_QUEUE": queue_name, + }, + } + + logger.info(f"Creating Buildkite build for {gpu_type.name} on queue {queue_name}") + + try: + response = await asyncio.to_thread( + requests.post, build_url, headers=self._headers, json=build_data + ) + response.raise_for_status() + except requests.RequestException as e: + logger.error(f"Failed to create Buildkite build: {e}") + return FullResult( + success=False, + error=f"Failed to create Buildkite build: {str(e)}", + runs={}, + system=SystemInfo(), + ) + + build = response.json() + build_number = build["number"] + build_url_html = build["web_url"] + + logger.info(f"Created Buildkite build #{build_number}: {build_url_html}") + await status.push(f"⏳ Buildkite build [#{build_number}](<{build_url_html}>) started...") + + # Poll for completion + timeout = get_timeout(config) + TIMEOUT_BUFFER_MINUTES + build_api_url = f"{BUILDKITE_API_BASE}/organizations/{self.org}/pipelines/{self.pipeline}/builds/{build_number}" + + try: + await self._wait_for_completion( + build_api_url, + build_number, + build_url_html, + timeout, + lambda state, elapsed: self._status_callback( + status, build_number, build_url_html, state, elapsed + ), + ) + except TimeoutError as e: + logger.error(f"Buildkite build #{build_number} timed out") + return FullResult( + success=False, + error=str(e), + runs={}, + system=SystemInfo(), + ) + except Exception as e: + logger.error(f"Error waiting for Buildkite build: {e}") + return FullResult( + success=False, + error=f"Build error: {str(e)}", + runs={}, + system=SystemInfo(), + ) + + await status.update(f"✅ Build [#{build_number}](<{build_url_html}>) completed") + + # Download artifacts + await status.push("Downloading artifacts...") + logger.info(f"Downloading artifacts for build #{build_number}") + + try: + result = await self._download_and_parse_result(build_api_url) + await status.update("Downloading artifacts... done") + return result + except Exception as e: + logger.error(f"Failed to download artifacts: {e}") + await status.update("Downloading artifacts... 
failed") + return FullResult( + success=False, + error=f"Failed to download artifacts: {str(e)}", + runs={}, + system=SystemInfo(), + ) + + async def _wait_for_completion( + self, + build_api_url: str, + build_number: int, + build_url_html: str, + timeout_minutes: int, + callback: Callable[[str, float], Awaitable[None]], + ): + """Poll Buildkite API until build completes or times out.""" + start_time = datetime.datetime.now(datetime.timezone.utc) + timeout = datetime.timedelta(minutes=timeout_minutes) + + while True: + try: + response = await asyncio.to_thread( + requests.get, build_api_url, headers=self._headers + ) + response.raise_for_status() + build = response.json() + + elapsed = (datetime.datetime.now(datetime.timezone.utc) - start_time).total_seconds() + + if elapsed > timeout.total_seconds(): + # Try to cancel the build + cancel_url = f"{build_api_url}/cancel" + await asyncio.to_thread( + requests.put, cancel_url, headers=self._headers + ) + raise TimeoutError( + f"Build #{build_number} cancelled - exceeded {timeout_minutes} minute timeout" + ) + + state = build.get("state", "unknown") + + if state in ("passed", "failed", "canceled", "blocked"): + if state != "passed": + logger.warning(f"Build #{build_number} finished with state: {state}") + return + + await callback(state, elapsed) + await asyncio.sleep(10) # Poll every 10 seconds + + except TimeoutError: + raise + except Exception as e: + logger.error(f"Error polling build status: {e}") + raise + + async def _status_callback( + self, + status: RunProgressReporter, + build_number: int, + build_url_html: str, + state: str, + elapsed: float, + ): + """Update status with current build state.""" + await status.update( + f"⏳ Build [#{build_number}](<{build_url_html}>): {state} ({elapsed:.1f}s)" + ) + + async def _download_and_parse_result(self, build_api_url: str) -> FullResult: + """Download artifacts and parse result.json.""" + # Get artifacts list + artifacts_url = f"{build_api_url}/artifacts" + response = await asyncio.to_thread( + requests.get, artifacts_url, headers=self._headers + ) + response.raise_for_status() + artifacts = response.json() + + # Find result.json artifact + result_artifact = None + profile_artifact = None + for artifact in artifacts: + if artifact.get("filename") == "result.json": + result_artifact = artifact + elif artifact.get("path", "").startswith("profile_data/"): + profile_artifact = artifact + + if not result_artifact: + raise RuntimeError("Could not find result.json artifact") + + # Download result.json + download_url = result_artifact.get("download_url") + response = await asyncio.to_thread( + requests.get, download_url, headers=self._headers + ) + response.raise_for_status() + + # Parse result + data = response.json() + runs = {} + + for k, v in data.get("runs", {}).items(): + comp_res = None if v.get("compilation") is None else CompileResult(**v["compilation"]) + run_res = None if v.get("run") is None else RunResult(**v["run"]) + profile_res = None if v.get("profile") is None else ProfileResult(**v["profile"]) + + # Add profile download URL if available + if profile_res is not None and profile_artifact: + profile_res.download_url = profile_artifact.get("download_url") + + res = EvalResult( + start=datetime.datetime.fromisoformat(v["start"]), + end=datetime.datetime.fromisoformat(v["end"]), + compilation=comp_res, + run=run_res, + profile=profile_res, + ) + runs[k] = res + + system = SystemInfo(**data.get("system", {})) + return FullResult(success=True, error="", runs=runs, system=system) diff --git 
a/src/runners/buildkite-runner.py b/src/runners/buildkite-runner.py new file mode 100644 index 00000000..9bae67fb --- /dev/null +++ b/src/runners/buildkite-runner.py @@ -0,0 +1,56 @@ +""" +Buildkite runner script for kernel submissions. + +This script runs inside a Docker container on Buildkite agents. +It reads the submission payload from the SUBMISSION_PAYLOAD environment variable, +executes the kernel, and writes results to result.json for artifact upload. + +The agent is pre-configured with: +- CUDA_VISIBLE_DEVICES bound to a single GPU +- CPU/RAM limits via systemd cgroups +""" + +import base64 +import json +import os +import zlib +from dataclasses import asdict +from datetime import datetime +from pathlib import Path + +from libkernelbot.run_eval import run_config + + +def serialize(obj: object): + """JSON serializer for objects not serializable by default.""" + if isinstance(obj, datetime): + return obj.isoformat() + raise TypeError(f"Type {type(obj)} not serializable") + + +def main(): + # Get payload from environment variable + payload_b64 = os.environ.get("SUBMISSION_PAYLOAD") + if not payload_b64: + raise RuntimeError("SUBMISSION_PAYLOAD environment variable not set") + + # Decompress and parse config + payload = zlib.decompress(base64.b64decode(payload_b64)).decode("utf-8") + config = json.loads(payload) + + # Run the submission + result = run_config(config) + + # Write result to file for artifact upload + result_dict = asdict(result) + Path("result.json").write_text(json.dumps(result_dict, default=serialize)) + + # Create profile_data directory if profiling was enabled + # (profile artifacts will be written there by run_config) + profile_dir = Path("profile_data") + if profile_dir.exists(): + print(f"Profile data available in {profile_dir}") + + +if __name__ == "__main__": + main() diff --git a/tests/test_buildkite.py b/tests/test_buildkite.py new file mode 100644 index 00000000..82937dc4 --- /dev/null +++ b/tests/test_buildkite.py @@ -0,0 +1,192 @@ +"""Tests for BuildkiteLauncher.""" + +import base64 +import json +import zlib +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from libkernelbot.consts import BuildkiteGPU, GPU, SchedulerType, get_gpu_by_name +from libkernelbot.launchers import BuildkiteLauncher +from libkernelbot.report import RunProgressReporter + + +class MockProgressReporter(RunProgressReporter): + """Test progress reporter that captures messages.""" + + def __init__(self, title: str = "Test Buildkite Run"): + super().__init__(title) + self.messages = [] + self.updates = [] + + async def push(self, message: str): + self.messages.append(message) + + async def update(self, message: str): + self.updates.append(message) + + +class TestBuildkiteGPU: + """Tests for BuildkiteGPU enum.""" + + def test_enum_values(self): + """Test that BuildkiteGPU has expected values.""" + assert BuildkiteGPU.NVIDIA_H100.value == "nvidia-h100" + assert BuildkiteGPU.NVIDIA_B200.value == "nvidia-b200" + assert BuildkiteGPU.AMD_MI300.value == "amd-mi300" + assert BuildkiteGPU.GOOGLE_TPU.value == "google-tpu" + + def test_scheduler_type_exists(self): + """Test that BUILDKITE scheduler type exists.""" + assert SchedulerType.BUILDKITE.value == "buildkite" + + def test_gpu_lookup(self): + """Test that Buildkite GPUs are in the lookup table.""" + gpu = get_gpu_by_name("nvidia_h100") + assert gpu is not None + assert gpu.value == "nvidia-h100" + assert gpu.runner == "Buildkite" + + gpu = get_gpu_by_name("amd_mi300") + assert gpu is not None + assert gpu.value == 
"amd-mi300" + assert gpu.runner == "Buildkite" + + +class TestBuildkiteLauncher: + """Tests for BuildkiteLauncher class.""" + + @pytest.fixture + def launcher(self): + return BuildkiteLauncher( + org="test-org", + pipeline="test-pipeline", + token="test-token", + ) + + @pytest.fixture + def mock_config(self): + return { + "lang": "py", + "mode": "test", + "files": {"main.py": "print('hello')"}, + "tests": [], + "benchmarks": [], + "test_timeout": 180, + "benchmark_timeout": 180, + "ranked_timeout": 180, + } + + @pytest.fixture + def gpu_type(self): + return GPU(name="NVIDIA_H100", value="nvidia-h100", runner="Buildkite") + + def test_init(self, launcher): + """Test launcher initialization.""" + assert launcher.name == "Buildkite" + assert launcher.org == "test-org" + assert launcher.pipeline == "test-pipeline" + assert launcher.gpus == BuildkiteGPU + + def test_headers(self, launcher): + """Test API headers are set correctly.""" + assert "Authorization" in launcher._headers + assert launcher._headers["Authorization"] == "Bearer test-token" + assert launcher._headers["Content-Type"] == "application/json" + + def test_payload_compression(self, mock_config): + """Test that payload compression/decompression works.""" + # Compress (same logic as launcher) + payload = base64.b64encode( + zlib.compress(json.dumps(mock_config).encode("utf-8")) + ).decode("utf-8") + + # Decompress (same logic as runner) + decompressed = zlib.decompress(base64.b64decode(payload)).decode("utf-8") + restored = json.loads(decompressed) + + assert restored == mock_config + + @pytest.mark.asyncio + async def test_run_submission_creates_build(self, launcher, mock_config, gpu_type): + """Test that run_submission creates a Buildkite build.""" + mock_response = MagicMock() + mock_response.json.return_value = { + "number": 123, + "web_url": "https://buildkite.com/test/builds/123", + "state": "scheduled", + } + mock_response.raise_for_status = MagicMock() + + with patch("libkernelbot.launchers.buildkite.requests.post", return_value=mock_response) as mock_post: + with patch.object(launcher, "_wait_for_completion", new_callable=AsyncMock): + with patch.object(launcher, "_download_and_parse_result", new_callable=AsyncMock) as mock_download: + mock_download.return_value = MagicMock(success=True) + + reporter = MockProgressReporter() + result = await launcher.run_submission(mock_config, gpu_type, reporter) + + # Verify API was called + mock_post.assert_called_once() + call_args = mock_post.call_args + + # Check URL contains org and pipeline + url = call_args[0][0] + assert "test-org" in url + assert "test-pipeline" in url + + # Check payload was compressed and queue set + body = call_args[1]["json"] + assert "SUBMISSION_PAYLOAD" in body["env"] + assert body["env"]["GPU_QUEUE"] == "nvidia-h100" + + @pytest.mark.asyncio + async def test_run_submission_handles_api_error(self, launcher, mock_config, gpu_type): + """Test that API errors are handled gracefully.""" + import requests + + with patch("libkernelbot.launchers.buildkite.requests.post") as mock_post: + mock_post.side_effect = requests.RequestException("API Error") + + reporter = MockProgressReporter() + result = await launcher.run_submission(mock_config, gpu_type, reporter) + + assert result.success is False + assert "API Error" in result.error + + @pytest.mark.asyncio + async def test_status_updates(self, launcher, mock_config, gpu_type): + """Test that status updates are sent correctly.""" + mock_response = MagicMock() + mock_response.json.return_value = { + "number": 456, + 
"web_url": "https://buildkite.com/test/builds/456", + "state": "scheduled", + } + mock_response.raise_for_status = MagicMock() + + with patch("libkernelbot.launchers.buildkite.requests.post", return_value=mock_response): + with patch.object(launcher, "_wait_for_completion", new_callable=AsyncMock): + with patch.object(launcher, "_download_and_parse_result", new_callable=AsyncMock) as mock_download: + mock_download.return_value = MagicMock(success=True) + + reporter = MockProgressReporter() + await launcher.run_submission(mock_config, gpu_type, reporter) + + # Check status messages were sent + assert any("456" in msg for msg in reporter.messages) + assert any("completed" in msg.lower() for msg in reporter.updates) + + +class TestBuildkiteRunner: + """Tests for buildkite-runner.py script.""" + + def test_runner_script_syntax(self): + """Test that runner script has valid Python syntax.""" + import py_compile + from pathlib import Path + + runner_path = Path(__file__).parent.parent / "src" / "runners" / "buildkite-runner.py" + # This will raise SyntaxError if invalid + py_compile.compile(str(runner_path), doraise=True) From c446f98a9ed59195d4d6b2645616d1de7a8855ac Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 29 Jan 2026 10:33:06 -0800 Subject: [PATCH 2/4] update --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 5c184087..69c663da 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ reference-kernels/ yoyo.ini .venv .claude/ +*.egg +*.egg-info/ From d11eafbacc82f00315ffb7039ab5e98b080dcab6 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 29 Jan 2026 10:33:59 -0800 Subject: [PATCH 3/4] update --- src/discord_cluster_manager.egg-info/PKG-INFO | 407 ------------------ .../SOURCES.txt | 71 --- .../dependency_links.txt | 1 - .../requires.txt | 23 - .../top_level.txt | 4 - 5 files changed, 506 deletions(-) delete mode 100644 src/discord_cluster_manager.egg-info/PKG-INFO delete mode 100644 src/discord_cluster_manager.egg-info/SOURCES.txt delete mode 100644 src/discord_cluster_manager.egg-info/dependency_links.txt delete mode 100644 src/discord_cluster_manager.egg-info/requires.txt delete mode 100644 src/discord_cluster_manager.egg-info/top_level.txt diff --git a/src/discord_cluster_manager.egg-info/PKG-INFO b/src/discord_cluster_manager.egg-info/PKG-INFO deleted file mode 100644 index 1e9fe5a0..00000000 --- a/src/discord_cluster_manager.egg-info/PKG-INFO +++ /dev/null @@ -1,407 +0,0 @@ -Metadata-Version: 2.4 -Name: discord-cluster-manager -Version: 0.1.0 -Summary: Discord bot for managing compute clusters and running kernel benchmarks -Requires-Python: >=3.10 -Description-Content-Type: text/markdown -License-File: LICENSE -Requires-Dist: PyGithub -Requires-Dist: aiohttp -Requires-Dist: discord.py -Requires-Dist: audioop-lts; python_version >= "3.13" -Requires-Dist: python-dotenv -Requires-Dist: requests -Requires-Dist: modal -Requires-Dist: psycopg2-binary -Requires-Dist: yoyo-migrations -Requires-Dist: better_profanity -Requires-Dist: PyYAML -Requires-Dist: fastapi[all] -Requires-Dist: uvicorn -Requires-Dist: jinja2 -Provides-Extra: dev -Requires-Dist: ruff; extra == "dev" -Requires-Dist: pre-commit; extra == "dev" -Requires-Dist: pytest; extra == "dev" -Requires-Dist: pytest-coverage; extra == "dev" -Requires-Dist: pytest-asyncio; extra == "dev" -Dynamic: license-file - -# KernelBot - 
-
-[![nvidia-on-prem](https://github.com/gpu-mode/discord-cluster-manager/actions/workflows/nvidia-on-prem-health.yml/badge.svg)](https://github.com/gpu-mode/discord-cluster-manager/actions/workflows/nvidia-on-prem-health.yml)
-
-This is the code for the Discord bot we'll be using to queue jobs to a cluster of GPUs that our generous sponsors have provided. Our goal is to be able to queue kernels that can run end to end in seconds, so that things feel interactive and social.
-
-The key idea is that we're using GitHub Actions as a job scheduling engine: the Discord bot interacts with the cluster primarily by issuing GitHub Actions workflows and monitoring their status. While we're focused on having a nice user experience on discord.gg/gpumode, [we're happy to accept PRs](#local-development) that make it easier for other Discord communities to hook up GPUs.
-
-## Table of Contents
-
-- [Supported Schedulers](#supported-schedulers)
-- [Local Development](#local-development)
-  - [Clone Repository](#clone-repository)
-  - [Setup Discord Bot](#setup-discord-bot)
-  - [Database](#database)
-  - [Environment Variables](#environment-variables)
-  - [Verify Setup](#verify-setup)
-- [Available Commands](#available-commands)
-- [Using the Leaderboard](#using-the-leaderboard)
-  - [Creating a New Leaderboard](#creating-a-new-leaderboard)
-  - [Reference Code Requirements (Python)](#reference-code-requirements-python)
-  - [Reference Code Requirements (CUDA)](#reference-code-requirements-cuda)
-  - [Submitting to a Leaderboard](#submitting-to-a-leaderboard)
-  - [Other Available Leaderboard Commands](#other-available-leaderboard-commands)
-  - [GPU Kernel-Specific Commands](#gpu-kernel-specific-commands)
-- [How to Add a New GPU to the Cluster](#how-to-add-a-new-gpu-to-the-cluster)
-- [Acknowledgements](#acknowledgements)
-
-## Supported Schedulers
-
-- GitHub Actions
-- Modal
-- Slurm (not implemented yet)
-
-## Local Development
-
-### Clone Repository
-
-> [!IMPORTANT]
-> Do not fork this repository. Instead, directly clone this repository to your local machine.
-
-> [!IMPORTANT]
-> Python 3.11 or higher is required.
-
-Afterwards, install the dependencies with `pip install -r requirements-dev.txt`.
-
-### Setup Discord Bot
-
-To run and develop the bot locally, you need to add it to your own "staging" server. Follow the steps [here](https://discordjs.guide/preparations/setting-up-a-bot-application.html#creating-your-bot) and [here](https://discordjs.guide/preparations/adding-your-bot-to-servers.html#bot-invite-links) to create a bot application and then add it to your staging server.
-
-Below is a visual walk-through of the steps linked above:
-
-- The bot needs the `Message Content Intent` and `Server Members Intent` permissions turned on.
-<details>
- Click here for visual. - DCS_bot_perms -
- -- The bot needs `applications.commands` and `bot` scopes. - -
- Click here for visual. - Screenshot 2024-11-24 at 12 34 09 PM -
- -- Finally, generate an invite link for the bot and enter it into any browser. - -
- Click here for visual. - Screenshot 2024-11-24 at 12 44 08 PM -
-
-> [!NOTE]
-> Bot permissions involving threads/mentions/messages should suffice, but you can naively give it `Administrator` since it's just a test bot in your own testing Discord server.
-
-### Database
-
-The leaderboard persists information in a Postgres database. To develop locally, set Postgres up on your machine. Then start a Postgres shell with `psql`, and create a database:
-
-```
-$ psql -U postgres
-Password for user postgres: ********
-psql (16.6 (Ubuntu 16.6-1.pgdg22.04+1))
-Type "help" for help.
-
-postgres=# CREATE DATABASE clusterdev;
-```
-
-We are using [Yoyo Migrations](https://ollycope.com/software/yoyo/) to manage tables, indexes, etc. in our database. To create tables in your local database, apply the migrations in `src/migrations` with the following command:
-
-```
-yoyo apply src/migrations \
-    -d postgresql://user:password@localhost/clusterdev
-```
-
-<details>
- Click here for a transcript of a yoyo apply session - - $ yoyo apply . -d postgresql://user:password@localhost/clusterdev - - [20241208_01_p3yuR-initial-leaderboard-schema] - Shall I apply this migration? [Ynvdaqjk?]: y - - Selected 1 migration: - [20241208_01_p3yuR-initial-leaderboard-schema] - Apply this migration to postgresql://user:password@localhost/clusterdev [Yn]: y - Save migration configuration to yoyo.ini? - This is saved in plain text and contains your database password. - - Answering 'y' means you do not have to specify the migration source or database connection for future runs [yn]: n - -
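-
-For reference, a finished migration file might look like the following sketch. The table and
-SQL here are hypothetical; `yoyo new` (see below) generates the file skeleton for you:
-
-```python
-"""
-short_description
-"""
-
-from yoyo import step
-
-__depends__ = {"20241208_01_p3yuR-initial-leaderboard-schema"}
-
-steps = [
-    step(
-        # Apply: a hypothetical expansive change (new nullable column).
-        "ALTER TABLE submission ADD COLUMN notes TEXT",
-        # Rollback.
-        "ALTER TABLE submission DROP COLUMN notes",
-    )
-]
-```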
-
-Applying migrations to our staging and prod environments also happens using `yoyo apply`, just with a different database URL.
-
-To make changes to the structure of the database, create a new migration:
-
-```
-yoyo new src/migrations -m "short_description"
-```
-
-...and then edit the generated file. Please do not edit existing migration files: the existing migration files form a sort of changelog that is supposed to be immutable, and so yoyo will refuse to reapply the changes.
-
-We are following an expand/migrate/contract pattern to allow database migrations without downtime. When you want to make a change to the structure of the database, first determine if it is expansive or contractive.
-
-- _Expansive changes_ are those that have no possibility of breaking a running application. Examples include: adding a new nullable column, adding a non-null column with a default value, adding an index, adding a table, etc.
-- _Contractive changes_ are those that could break a running application. Examples include: dropping a table, dropping a column, adding a not null constraint to a column, adding a unique index, etc.
-
-After an expansive phase, data gets migrated to the newly added elements. Code also begins using the newly added elements. This is the migration step. Finally, when all code is no longer using elements that are obsolete, these can be removed. (Or, if adding a unique or not null constraint, after checking that the data satisfies the constraint, then the constraint can be safely added.)
-
-Expand, migrate, and contract steps may all be written using yoyo.
-
-### Environment Variables
-
-Create a `.env` file with the following environment variables:
-
-- `DISCORD_DEBUG_TOKEN` : The token of the bot you want to run locally
-- `DISCORD_TOKEN` : The token of the bot you want to run in production
-- `DISCORD_DEBUG_CLUSTER_STAGING_ID` : The ID of the "staging" server you want to connect to
-- `DISCORD_CLUSTER_STAGING_ID` : The ID of the "production" server you want to connect to
-- `GITHUB_TOKEN` : A GitHub token with permissions to trigger workflows; for now, only new branches from [discord-cluster-manager](https://github.com/gpu-mode/discord-cluster-manager) are tested, since the bot triggers workflows on your behalf
-- `GITHUB_REPO` : The repository where the cluster manager is hosted.
-- `GITHUB_WORKFLOW_BRANCH` : The branch to start the GitHub Actions jobs from when submitting a task.
-- `DATABASE_URL` : The URL you use to connect to Postgres.
-- `DISABLE_SSL` : (Optional) set if you want to disable SSL when connecting to Postgres.
-
-Below is where to find these environment variables:
-
-> [!NOTE]
-> For now, you can naively set `DISCORD_DEBUG_TOKEN` and `DISCORD_DEBUG_CLUSTER_STAGING_ID` to the same values as `DISCORD_TOKEN` and `DISCORD_CLUSTER_STAGING_ID` respectively.
-
-- `DISCORD_DEBUG_TOKEN` or `DISCORD_TOKEN`: Found in your bot's page within the [Discord Developer Portal](https://discord.com/developers/applications/):
-
-<details>
- Click here for visual. - Screenshot 2024-11-24 at 11 01 19 AM -
- -- `DISCORD_DEBUG_CLUSTER_STAGING_ID` or `DISCORD_CLUSTER_STAGING_ID`: Right-click your staging Discord server and select `Copy Server ID`: - -
- Click here for visual. - Screenshot 2024-11-24 at 10 58 27 AM -
-
-- `GITHUB_TOKEN`: Found in Settings -> Developer Settings (or [here](https://github.com/settings/tokens?type=beta)). Create a new (preferably classic) personal access token with an expiration date less than a year away, and the scopes `repo` and `workflow`.
-
-<details>
- Click here for visual. - Screenshot 2024-12-30 at 8 51 59 AM -
-
-- `GITHUB_REPO`: This should be set to this repository, which is usually `gpu-mode/discord-cluster-manager`.
-
-- `GITHUB_WORKFLOW_BRANCH`: Usually `main` or the branch you are working from.
-
-- `DATABASE_URL`: This contains the connection details for your local database, and has the form `postgresql://user:password@localhost/clusterdev`.
-
-- `DISABLE_SSL`: Set to `1` when developing.
-
-### Verify Setup
-
-Install the kernel bot as an editable package using `pip install -e .`
-
-Run the following command to run the bot:
-
-```
-python src/kernelbot/main.py --debug
-```
-
-Then in your staging server, use the `/verifyruns` command to test basic functionality of the bot and the `/verifydb` command to check database connectivity.
-
-> [!NOTE]
-> To test functionality of the Modal runner, you also need to be authenticated with Modal. Modal provides free credits to get started.
-> To test functionality of the GitHub runner, you may need direct access to this repo, which you can ping us for.
-
-## Available Commands
-
-TODO. This is currently a work in progress.
-
-`/run modal ` which you can use to pick a specific GPU; right now it defaults to T4
-
-`/run github ` which picks one of two workflow files
-
-`/resync` to clear all the commands and resync them
-
-`/ping` to check if the bot is online
-
-## Using the Leaderboard
-
-The main purpose of the Discord bot is to allow servers to host coding competitions through Discord.
-The leaderboard was designed for evaluating GPU kernels, but can be adapted easily for other
-competitions. The rest of this section will mostly refer to leaderboard submissions in the context
-of our GPU Kernel competition.
-
-> [!NOTE]
-> All leaderboard commands have the prefix `/leaderboard`, and center around creating, submitting to,
-> and viewing leaderboard statistics and information.
-
-### Creating a New Leaderboard
-
-```
-/leaderboard create {name: str} {deadline: str} {reference_code: .cu or .py file}
-```
-
-The above command creates a leaderboard named `name` that ends at `deadline`. The `reference_code`
-has strict function signature requirements, and is required to contain an input generator and a
-reference implementation for the desired GPU kernel. We import these functions in our evaluation
-scripts for verifying leaderboard submissions and measuring runtime. In the next mini-section, we
-discuss the exact requirements for the `reference_code` script.
-
-Each leaderboard `name` can also specify the types of hardware that users can run their kernels on.
-For example, a softmax kernel on an RTX 4090 can have different performance characteristics on an
-H100. After running the leaderboard creation command, a prompt will pop up where the creator can
-specify the available GPUs that the leaderboard evaluates on.
-
-![Leaderboard GPU](assets/img/lb_gpu.png)
-
-#### Reference Code Requirements (Python)
-
-The Discord bot internally contains an `eval.py` script that handles the correctness and timing
-analysis for the leaderboard. The `reference_code` that the leaderboard creator submits must have
-the following function signatures with their implementations filled out. `InputType` and
-`OutputType` are generics that could be a `torch.Tensor`, `List[torch.Tensor]`, etc.
-depending on the reference code specifications. We leave this flexibility to the leaderboard creator.
-
-```python
-# Reference kernel implementation.
-def ref_kernel(input: InputType) -> OutputType:
-    # Implement me...
-
-# Generate a list of tensors as input to the kernel
-def generate_input() -> InputType:
-    # Implement me...
-
-# Verify correctness of reference and output
-def check_implementation(custom_out: OutputType, reference_out: OutputType) -> bool:
-    # Implement me...
-```
-
-#### Reference Code Requirements (CUDA)
-
-The Discord bot internally contains an `eval.cu` script that handles the correctness and timing
-analysis for the leaderboard. The difficulty of CUDA evaluation scripts is that we need to explicitly
-handle the typing system for tensors. The `reference.cu` that the leaderboard creator submits must have
-the following function signatures with their implementations filled out.
-
-The main difference is that we now need to define an alias for the input / output types. A
-simple and common example is a list of FP32 tensors, which can be defined using a pre-defined
-`const int` called `N_SIZES` and an array of containers, e.g.
-`std::array<std::vector<float>, N_SIZES>`.
-
-```cuda
-// User-defined type for inputs, e.g. using input_t = std::array<std::vector<float>, IN_SIZES>;
-using input_t = ...;
-
-// User-defined type for outputs, e.g. using output_t = std::array<std::vector<float>, OUT_SIZES>;
-using output_t = ...;
-
-// Generate random data of type input_t
-input_t generate_input() {
-    // Implement me...
-}
-
-
-// Reference kernel host code.
-output_t reference(input_t data) {
-    // Implement me...
-}
-
-
-// Verify correctness of reference and output
-bool check_implementation(output_t out, output_t ref) {
-    // Implement me...
-}
-```
-
-### Submitting to a Leaderboard
-
-```
-/leaderboard submit {github / modal} {leaderboard_name: str} {script: .cu or .py file}
-```
-
-The leaderboard submission for _Python code_ requires the following function signatures, where
-`InputType` and `OutputType` are generics that could be a `torch.Tensor`, `List[torch.Tensor]`,
-etc. depending on the reference code specifications.
-
-```python
-# User kernel implementation.
-def custom_kernel(input: InputType) -> OutputType:
-    # Implement me...
-```
-
-A complete, illustrative reference / submission pair is sketched at the end of this section.
-
-### Other Available Leaderboard Commands
-
-Deleting a leaderboard:
-
-```
-/leaderboard delete {name: str}
-```
-
-List all active leaderboards and which GPUs they can run on:
-
-```
-/leaderboard list
-```
-
-List all leaderboard scores (runtime) for a particular leaderboard (currently deprecated; it does
-not support multiple GPU types yet):
-
-```
-/leaderboard show {name: str}
-```
-
-Display all personal scores (runtime) from a specific leaderboard:
-
-```
-/leaderboard show-personal {name: str}
-```
-
-### Submitting via a CLI
-
-Moving forward, we also allow submissions without logging in to Discord via [popcorn-cli](https://github.com/gpu-mode/popcorn-cli), a CLI tool we wrote in Rust.
-
-#### GPU Kernel-specific Commands
-
-We plan to add support for the PyTorch profiler and the NVIDIA Nsight Compute CLI to allow users to
-profile their kernels. These commands are not specific to the leaderboard, but may be helpful for
-leaderboard submissions.
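-
-To make the requirements above concrete, below is an illustrative sketch of a matching
-reference / submission pair for a simple elementwise-square kernel. The operation, the sizes,
-and the concrete `InputType` / `OutputType` here are invented for illustration; every real
-leaderboard defines its own in its reference code.
-
-```python
-from typing import List
-
-import torch
-
-# For this example only: the leaderboard's reference code fixes these types.
-InputType = List[torch.Tensor]
-OutputType = List[torch.Tensor]
-
-
-# --- Reference code (written by the leaderboard creator) ---
-def generate_input() -> InputType:
-    # Four small random tensors as kernel input.
-    return [torch.randn(1024) for _ in range(4)]
-
-
-def ref_kernel(input: InputType) -> OutputType:
-    # Trivially correct (but not necessarily fast) reference implementation.
-    return [x * x for x in input]
-
-
-def check_implementation(custom_out: OutputType, reference_out: OutputType) -> bool:
-    # Elementwise comparison with a floating-point tolerance.
-    return all(torch.allclose(c, r) for c, r in zip(custom_out, reference_out))
-
-
-# --- Submission (written by the competitor) ---
-def custom_kernel(input: InputType) -> OutputType:
-    return [torch.square(x) for x in input]
-```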
-
-## How to Add a New GPU to the Cluster
-
-If you'd like to donate a GPU to our efforts, we can make you a CI admin on GitHub and have you add an org-level runner: https://github.com/organizations/gpu-mode/settings/actions/runners
-
-## Acknowledgements
-
-- Thank you to AMD for sponsoring an MI250 node
-- Thank you to NVIDIA for sponsoring an H100 node
-- Thank you to Nebius for sponsoring credits and an H100 node
-- Thank you to Modal for credits and speedy startup times
-- Luca Antiga did something very similar for the NeurIPS LLM efficiency competition; it was great!
-- Midjourney was a similar inspiration in terms of UX
-
-## Citation
-If you use our software, please cite it as:
-
-```
-@inproceedings{
-  kernelbot2025,
-  title={KernelBot: A Competition Platform for Writing Heterogeneous {GPU} Code},
-  author={Alex L Zhang and Matej Sirovatka and Erik Schultheis and Benjamin Horowitz and Mark Saroufim},
-  note={Equal Contribution},
-  booktitle={Championing Open-source Development in ML Workshop @ ICML25},
-  year={2025},
-  url={https://openreview.net/forum?id=bq9U4dmuyJ}
-}
-```
diff --git a/src/discord_cluster_manager.egg-info/SOURCES.txt b/src/discord_cluster_manager.egg-info/SOURCES.txt
deleted file mode 100644
index 06f31a50..00000000
--- a/src/discord_cluster_manager.egg-info/SOURCES.txt
+++ /dev/null
@@ -1,71 +0,0 @@
-LICENSE
-README.md
-pyproject.toml
-src/discord_cluster_manager.egg-info/PKG-INFO
-src/discord_cluster_manager.egg-info/SOURCES.txt
-src/discord_cluster_manager.egg-info/dependency_links.txt
-src/discord_cluster_manager.egg-info/requires.txt
-src/discord_cluster_manager.egg-info/top_level.txt
-src/kernelbot/discord_reporter.py
-src/kernelbot/discord_utils.py
-src/kernelbot/env.py
-src/kernelbot/main.py
-src/kernelbot/api/__init__.py
-src/kernelbot/api/api_utils.py
-src/kernelbot/api/main.py
-src/kernelbot/cogs/__init__.py
-src/kernelbot/cogs/admin_cog.py
-src/kernelbot/cogs/leaderboard_cog.py
-src/kernelbot/cogs/misc_cog.py
-src/kernelbot/cogs/verify_run_cog.py
-src/kernelbot/ui/misc.py
-src/kernelbot/ui/table.py
-src/libkernelbot/__init__.py
-src/libkernelbot/backend.py
-src/libkernelbot/background_submission_manager.py
-src/libkernelbot/consts.py
-src/libkernelbot/db_types.py
-src/libkernelbot/leaderboard_db.py
-src/libkernelbot/report.py
-src/libkernelbot/run_eval.py
-src/libkernelbot/submission.py
-src/libkernelbot/task.py
-src/libkernelbot/utils.py
-src/libkernelbot/launchers/__init__.py
-src/libkernelbot/launchers/github.py
-src/libkernelbot/launchers/launcher.py
-src/libkernelbot/launchers/modal.py
-src/migrations/20241208_01_p3yuR-initial-leaderboard-schema.py
-src/migrations/20241214_01_M62BX-drop-old-leaderboard-tables.py
-src/migrations/20241221_01_54Oeg-rename-problem-table.py
-src/migrations/20241222_01_ELxU5-add-gpu-types.py
-src/migrations/20241224_01_Pg4FX-delete-cascade.py
-src/migrations/20241226_01_ZQSOK-add_gpu_type_to_submission.py
-src/migrations/20250106_01_Sgph3-add-leaderboard-creator-id.py
-src/migrations/20250202_01_YYS3Q-leaderboard-rename-reference-to-task.py
-src/migrations/20250221_01_GA8ro-submission-collection.py
-src/migrations/20250228_01_9ANYn-submission-add-user-name.py
-src/migrations/20250304_01_DzORz-collect-system-information-for-each-run.py
-src/migrations/20250316_01_5oMi3-remember-forum-id.py
-src/migrations/20250329_01_7VjJJ-add-a-secret-seed-column.py
-src/migrations/20250406_01_ZXjWK-user-info-add-cli-id.py
-src/migrations/20250412_01_l7Dra-user-info-fix-auth.py
-src/migrations/20250412_02_NN9kK-user-info-cli-drop-old.py
-src/migrations/20250506_01_38PkG-add-index-on-runs-runner-score.py -src/migrations/20250617_01_c5mrF-task-split.py -src/migrations/20250728_01_Q3jso-fix-code-table.py -src/migrations/20250822_01_UtXzl-website-submission.py -src/migrations/20251106_01_kOjGy-draft-code-editor.py -src/migrations/20260108_01_gzSm3-add-submission-status.py -src/runners/github-runner.py -src/runners/modal_runner.py -src/runners/modal_runner_archs.py -tests/test_backend.py -tests/test_background_submission_manager.py -tests/test_github.py -tests/test_leaderboard_db.py -tests/test_modal.py -tests/test_report.py -tests/test_submission.py -tests/test_task.py -tests/test_utils.py \ No newline at end of file diff --git a/src/discord_cluster_manager.egg-info/dependency_links.txt b/src/discord_cluster_manager.egg-info/dependency_links.txt deleted file mode 100644 index 8b137891..00000000 --- a/src/discord_cluster_manager.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/discord_cluster_manager.egg-info/requires.txt b/src/discord_cluster_manager.egg-info/requires.txt deleted file mode 100644 index eb2fb39a..00000000 --- a/src/discord_cluster_manager.egg-info/requires.txt +++ /dev/null @@ -1,23 +0,0 @@ -PyGithub -aiohttp -discord.py -python-dotenv -requests -modal -psycopg2-binary -yoyo-migrations -better_profanity -PyYAML -fastapi[all] -uvicorn -jinja2 - -[:python_version >= "3.13"] -audioop-lts - -[dev] -ruff -pre-commit -pytest -pytest-coverage -pytest-asyncio diff --git a/src/discord_cluster_manager.egg-info/top_level.txt b/src/discord_cluster_manager.egg-info/top_level.txt deleted file mode 100644 index e90efa46..00000000 --- a/src/discord_cluster_manager.egg-info/top_level.txt +++ /dev/null @@ -1,4 +0,0 @@ -kernelbot -libkernelbot -migrations -runners From fa8084a14bcbb742ecb89bdbcf3f471c79210cad Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 29 Jan 2026 13:50:51 -0800 Subject: [PATCH 4/4] update --- .buildkite/pipeline.yml | 7 ++- src/libkernelbot/consts.py | 8 +++ src/libkernelbot/launchers/buildkite.py | 3 +- src/runners/buildkite-runner.py | 68 +++++++++++++++++-------- tests/test_buildkite.py | 6 ++- 5 files changed, 68 insertions(+), 24 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index be68002a..7fcc9928 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -9,6 +9,10 @@ # Environment variables passed from BuildkiteLauncher: # - SUBMISSION_PAYLOAD: Base64-encoded, zlib-compressed submission config # - GPU_QUEUE: Queue name for agent routing +# +# Note: timeout_in_minutes is set high (60) as a safety net. +# The BuildkiteLauncher handles dynamic timeouts based on submission mode +# and will cancel jobs that exceed their configured timeout. 
steps: - label: ":gpu: Run Kernel Submission" @@ -19,7 +23,8 @@ steps: agents: # Route to agent with matching queue tag queue: "${GPU_QUEUE}" - timeout_in_minutes: 15 + # Safety timeout - BuildkiteLauncher handles actual timeout enforcement + timeout_in_minutes: 60 artifact_paths: - "result.json" - "profile_data/**/*" diff --git a/src/libkernelbot/consts.py b/src/libkernelbot/consts.py index e5beabee..bbb37aad 100644 --- a/src/libkernelbot/consts.py +++ b/src/libkernelbot/consts.py @@ -135,6 +135,14 @@ class RankCriterion(Enum): "MI300": None, "MI300x8": None, "MI250": None, + # Buildkite GPU types (vendor-prefixed queue names) + "nvidia-h100": "90a", + "nvidia-b200": "100", + "nvidia-a100": "80", + "nebius-h100": "90a", + "amd-mi300": None, + "amd-mi250": None, + "google-tpu": None, } diff --git a/src/libkernelbot/launchers/buildkite.py b/src/libkernelbot/launchers/buildkite.py index 78aa7b09..32542247 100644 --- a/src/libkernelbot/launchers/buildkite.py +++ b/src/libkernelbot/launchers/buildkite.py @@ -221,7 +221,8 @@ async def _wait_for_completion( if state in ("passed", "failed", "canceled", "blocked"): if state != "passed": logger.warning(f"Build #{build_number} finished with state: {state}") - return + raise RuntimeError(f"Build #{build_number} {state}") + return state await callback(state, elapsed) await asyncio.sleep(10) # Poll every 10 seconds diff --git a/src/runners/buildkite-runner.py b/src/runners/buildkite-runner.py index 9bae67fb..aec139be 100644 --- a/src/runners/buildkite-runner.py +++ b/src/runners/buildkite-runner.py @@ -13,6 +13,8 @@ import base64 import json import os +import sys +import traceback import zlib from dataclasses import asdict from datetime import datetime @@ -28,28 +30,52 @@ def serialize(obj: object): raise TypeError(f"Type {type(obj)} not serializable") +def write_error_result(error_message: str): + """Write an error result to result.json when execution fails.""" + error_result = { + "success": False, + "error": error_message, + "runs": {}, + "system": {}, + } + Path("result.json").write_text(json.dumps(error_result, default=serialize)) + + def main(): - # Get payload from environment variable - payload_b64 = os.environ.get("SUBMISSION_PAYLOAD") - if not payload_b64: - raise RuntimeError("SUBMISSION_PAYLOAD environment variable not set") - - # Decompress and parse config - payload = zlib.decompress(base64.b64decode(payload_b64)).decode("utf-8") - config = json.loads(payload) - - # Run the submission - result = run_config(config) - - # Write result to file for artifact upload - result_dict = asdict(result) - Path("result.json").write_text(json.dumps(result_dict, default=serialize)) - - # Create profile_data directory if profiling was enabled - # (profile artifacts will be written there by run_config) - profile_dir = Path("profile_data") - if profile_dir.exists(): - print(f"Profile data available in {profile_dir}") + try: + # Get payload from environment variable + payload_b64 = os.environ.get("SUBMISSION_PAYLOAD") + if not payload_b64: + write_error_result("SUBMISSION_PAYLOAD environment variable not set") + sys.exit(1) + + # Decompress and parse config + try: + payload = zlib.decompress(base64.b64decode(payload_b64)).decode("utf-8") + config = json.loads(payload) + except Exception as e: + write_error_result(f"Failed to decompress/parse payload: {e}") + sys.exit(1) + + # Run the submission + result = run_config(config) + + # Write result to file for artifact upload + result_dict = asdict(result) + Path("result.json").write_text(json.dumps(result_dict, 
default=serialize)) + + # Create profile_data directory if profiling was enabled + # (profile artifacts will be written there by run_config) + profile_dir = Path("profile_data") + if profile_dir.exists(): + print(f"Profile data available in {profile_dir}") + + except Exception as e: + # Catch any unexpected errors and write them to result.json + error_msg = f"Runner error: {e}\n{traceback.format_exc()}" + print(error_msg, file=sys.stderr) + write_error_result(error_msg) + sys.exit(1) if __name__ == "__main__": diff --git a/tests/test_buildkite.py b/tests/test_buildkite.py index 82937dc4..ac938ee4 100644 --- a/tests/test_buildkite.py +++ b/tests/test_buildkite.py @@ -70,7 +70,8 @@ def mock_config(self): return { "lang": "py", "mode": "test", - "files": {"main.py": "print('hello')"}, + "main": "main.py", + "sources": {"main.py": "print('hello')"}, "tests": [], "benchmarks": [], "test_timeout": 180, @@ -141,6 +142,9 @@ async def test_run_submission_creates_build(self, launcher, mock_config, gpu_typ assert "SUBMISSION_PAYLOAD" in body["env"] assert body["env"]["GPU_QUEUE"] == "nvidia-h100" + # Verify result + assert result.success is True + @pytest.mark.asyncio async def test_run_submission_handles_api_error(self, launcher, mock_config, gpu_type): """Test that API errors are handled gracefully."""