diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml new file mode 100644 index 00000000..7fcc9928 --- /dev/null +++ b/.buildkite/pipeline.yml @@ -0,0 +1,50 @@ +# Buildkite Pipeline for Kernel Submissions +# +# This pipeline runs kernel submissions on GPU-bound Buildkite agents. +# Each agent is configured with: +# - CUDA_VISIBLE_DEVICES bound to a single GPU +# - CPU/RAM limits via systemd cgroups +# - Queue tag for GPU routing (e.g., queue=nvidia-h100-0) +# +# Environment variables passed from BuildkiteLauncher: +# - SUBMISSION_PAYLOAD: Base64-encoded, zlib-compressed submission config +# - GPU_QUEUE: Queue name for agent routing +# +# Note: timeout_in_minutes is set high (60) as a safety net. +# The BuildkiteLauncher handles dynamic timeouts based on submission mode +# and will cancel jobs that exceed their configured timeout. + +steps: + - label: ":gpu: Run Kernel Submission" + command: "python /opt/kernelbot/buildkite-runner.py" + env: + # Payload is passed via BuildkiteLauncher + SUBMISSION_PAYLOAD: "${SUBMISSION_PAYLOAD}" + agents: + # Route to agent with matching queue tag + queue: "${GPU_QUEUE}" + # Safety timeout - BuildkiteLauncher handles actual timeout enforcement + timeout_in_minutes: 60 + artifact_paths: + - "result.json" + - "profile_data/**/*" + plugins: + - docker#v5.11.0: + image: "ghcr.io/gpu-mode/kernelbot-runner:latest" + always-pull: true + propagate-environment: true + # GPU access - agent already bound to single GPU via CUDA_VISIBLE_DEVICES + gpus: all + # Resource limits (can be overridden via env vars) + memory: "${MEMORY_LIMIT:-32g}" + cpus: "${CPU_LIMIT:-16}" + # Mount working directory for artifacts + volumes: + - ".:/workdir" + workdir: "/workdir" + retry: + automatic: + - exit_status: -1 # Agent lost connection + limit: 1 + - exit_status: 255 # SSH error + limit: 1 diff --git a/.github/workflows/build-runner-image.yml b/.github/workflows/build-runner-image.yml new file mode 100644 index 00000000..996a9424 --- /dev/null +++ b/.github/workflows/build-runner-image.yml @@ -0,0 +1,107 @@ +name: Build Runner Image + +on: + push: + branches: + - main + paths: + - 'docker/kernelbot-runner/**' + - 'src/libkernelbot/**' + - 'src/runners/buildkite-runner.py' + - '.github/workflows/build-runner-image.yml' + pull_request: + paths: + - 'docker/kernelbot-runner/**' + - 'src/libkernelbot/**' + - 'src/runners/buildkite-runner.py' + workflow_dispatch: + inputs: + push: + description: 'Push image to registry' + required: false + default: 'true' + type: boolean + schedule: + # Rebuild weekly on Sundays at 2 AM UTC + - cron: '0 2 * * 0' + +env: + REGISTRY: ghcr.io + IMAGE_NAME: gpu-mode/kernelbot-runner + +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Container Registry + if: github.event_name != 'pull_request' + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata for Docker + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=raw,value=latest,enable={{is_default_branch}} + type=sha,prefix=sha- + type=ref,event=branch + type=ref,event=pr + + - name: Build and push Docker image + uses: docker/build-push-action@v5 + with: + context: . 
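+          # Repo-root build context so the Dockerfile can COPY src/libkernelbot
+          # and src/runners/buildkite-runner.py into the image.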
+ file: docker/kernelbot-runner/Dockerfile + push: ${{ github.event_name != 'pull_request' && (github.event.inputs.push != 'false') }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Generate build summary + run: | + echo "## Docker Image Build Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Image:** \`${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}\`" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Tags:**" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + echo "${{ steps.meta.outputs.tags }}" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + + - name: Notify vendors (Slack) + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + continue-on-error: true + uses: slackapi/slack-github-action@v1.25.0 + with: + payload: | + { + "text": "New kernelbot-runner image published", + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "*New kernelbot-runner image published* :package:\n\nVendors: run `./scripts/buildkite/update-image.sh` to update your agents.\n\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View build>" + } + } + ] + } + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_VENDOR_WEBHOOK }} + SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK diff --git a/.gitignore b/.gitignore index 5c184087..69c663da 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ reference-kernels/ yoyo.ini .venv .claude/ +*.egg +*.egg-info/ diff --git a/docker/kernelbot-runner/Dockerfile b/docker/kernelbot-runner/Dockerfile new file mode 100644 index 00000000..8708f2fe --- /dev/null +++ b/docker/kernelbot-runner/Dockerfile @@ -0,0 +1,67 @@ +# Kernelbot Runner Docker Image +# +# This image is used by Buildkite agents to run kernel submissions. +# It matches the Modal runner configuration for consistent behavior. +# +# Build: +# docker build -t ghcr.io/gpu-mode/kernelbot-runner:latest -f docker/kernelbot-runner/Dockerfile . +# +# Run locally (for testing): +# docker run --gpus '"device=0"' -e SUBMISSION_PAYLOAD="..." 
kernelbot-runner:latest + +FROM nvidia/cuda:13.1.0-devel-ubuntu24.04 + +LABEL org.opencontainers.image.source="https://github.com/gpu-mode/kernelbot" +LABEL org.opencontainers.image.description="Kernelbot GPU runner for kernel competitions" + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3.13 \ + python3.13-venv \ + python3-pip \ + git \ + gcc-13 \ + g++-13 \ + clang-18 \ + curl \ + && rm -rf /var/lib/apt/lists/* \ + && ln -sf /usr/bin/python3.13 /usr/bin/python3 \ + && ln -sf /usr/bin/python3.13 /usr/bin/python + +# Create virtual environment +RUN python3 -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# Install Python dependencies (matching modal_runner.py) +COPY docker/kernelbot-runner/requirements-runner.txt /tmp/ +RUN pip install --upgrade pip && \ + pip install -r /tmp/requirements-runner.txt + +# Install PyTorch with CUDA 13.0 support +RUN pip install \ + torch==2.9.1 \ + torchvision \ + torchaudio \ + --index-url https://download.pytorch.org/whl/cu130 + +# Install additional frameworks +RUN pip install \ + tinygrad~=0.10 + +# Install NVIDIA CUDA packages +RUN pip install \ + nvidia-cupynumeric~=25.3 \ + nvidia-cutlass-dsl==4.3.5 \ + "cuda-core[cu13]" \ + "cuda-python[all]==13.0" + +# Copy kernelbot library and runner +WORKDIR /opt/kernelbot +COPY src/libkernelbot /opt/kernelbot/libkernelbot +COPY src/runners/buildkite-runner.py /opt/kernelbot/ + +# Set PYTHONPATH so libkernelbot is importable +ENV PYTHONPATH="/opt/kernelbot:$PYTHONPATH" + +# Default command +CMD ["python", "/opt/kernelbot/buildkite-runner.py"] diff --git a/docker/kernelbot-runner/requirements-runner.txt b/docker/kernelbot-runner/requirements-runner.txt new file mode 100644 index 00000000..8ba03df9 --- /dev/null +++ b/docker/kernelbot-runner/requirements-runner.txt @@ -0,0 +1,17 @@ +# Kernelbot Runner Dependencies +# These should match the Modal runner configuration in modal_runner.py + +# Build tools +ninja~=1.11 +wheel~=0.45 +setuptools + +# Core dependencies +requests~=2.32.4 +packaging~=25.0 +numpy~=2.3 +pytest +PyYAML + +# Triton for GPU kernels +triton diff --git a/docs/BUILDKITE_POC.md b/docs/BUILDKITE_POC.md new file mode 100644 index 00000000..0afe10ed --- /dev/null +++ b/docs/BUILDKITE_POC.md @@ -0,0 +1,304 @@ +# Buildkite Integration POC + +## Executive Summary + +This document describes a proof-of-concept implementation of Buildkite as a new scheduler for Kernelbot GPU kernel competitions. Buildkite solves critical isolation problems that make microbenchmarking on vendor-donated hardware unreliable. + +**Status**: Implementation complete, unit tests passing, ready for integration testing with real Buildkite agents. 
+ +--- + +## Problem Statement + +When vendors donate GPU compute for kernel competitions, we face these challenges: + +| Problem | Impact | +|---------|--------| +| Multiple kernels on same GPU | Measurements become unreliable | +| No CPU/RAM isolation | Neighbor jobs affect benchmarks | +| Complex runner setup | Vendors spend weeks configuring isolation | +| No standardized onboarding | Each vendor does it differently | + +### Current State + +- **Modal**: Good isolation but cloud-only, can't use donated on-prem hardware +- **GitHub Actions**: Runners see all GPUs, no resource limits, complex setup + +--- + +## Solution: Buildkite + +Buildkite provides the primitives we need for proper isolation: + +| Requirement | Buildkite Solution | +|-------------|-------------------| +| 1 GPU per job | 1 agent per GPU, bound via `CUDA_VISIBLE_DEVICES` | +| CPU/RAM limits | Agent runs in systemd cgroup slice | +| No interference | Agent processes 1 job at a time (default) | +| Queue routing | Agent tags route jobs to specific GPUs | +| Easy onboarding | Bootstrap script + Dockerfile in our repo | + +### Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Vendor Node (8x H100, 256GB RAM, 128 cores) │ +├─────────────────────────────────────────────────────────────────┤ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ Agent 0 │ │ Agent 1 │ ... │ Agent 7 │ │ +│ │ GPU 0 only │ │ GPU 1 only │ │ GPU 7 only │ │ +│ │ 32GB RAM │ │ 32GB RAM │ │ 32GB RAM │ │ +│ │ 16 CPU cores │ │ 16 CPU cores │ │ 16 CPU cores │ │ +│ │ queue= │ │ queue= │ │ queue= │ │ +│ │ nvidia-h100-0│ │ nvidia-h100-1│ │ nvidia-h100-7│ │ +│ └──────────────┘ └──────────────┘ └──────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Queue Naming Convention + +Format: `{vendor}-{gpu_type}-{index}` + +Examples: +- `nvidia-h100-0` - NVIDIA-donated H100, first GPU +- `amd-mi300-0` - AMD-donated MI300 +- `google-tpu-0` - Google-donated TPU +- `nebius-h100-0` - Nebius-donated H100 + +This supports concurrent competitions where different vendors donate the same GPU type. 
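+
+The convention is purely mechanical. As a minimal illustrative sketch (hypothetical helper functions, not part of the implementation), a per-GPU queue name is just the `BuildkiteGPU` enum value (the queue prefix) joined with a GPU index:
+
+```python
+def make_queue_name(queue_prefix: str, gpu_index: int) -> str:
+    """Build a per-GPU queue name, e.g. ("nvidia-h100", 0) -> "nvidia-h100-0"."""
+    return f"{queue_prefix}-{gpu_index}"
+
+
+def split_queue_name(queue_name: str) -> tuple[str, int]:
+    """Recover (queue_prefix, gpu_index); assumes the GPU index is the last "-" field."""
+    prefix, index = queue_name.rsplit("-", 1)
+    return prefix, int(index)
+
+
+assert make_queue_name("nvidia-h100", 0) == "nvidia-h100-0"
+assert split_queue_name("amd-mi300-0") == ("amd-mi300", 0)
+```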
+ +--- + +## Implementation + +### Files Created + +| File | Purpose | +|------|---------| +| `src/libkernelbot/launchers/buildkite.py` | BuildkiteLauncher class | +| `src/runners/buildkite-runner.py` | Runner script for agents | +| `docker/kernelbot-runner/Dockerfile` | Container image (source of truth) | +| `docker/kernelbot-runner/requirements-runner.txt` | Python dependencies | +| `.buildkite/pipeline.yml` | Buildkite pipeline config | +| `scripts/buildkite/setup-agent.sh` | Agent bootstrap script | +| `scripts/buildkite/update-image.sh` | Image update script | +| `.github/workflows/build-runner-image.yml` | Auto-build on changes | +| `docs/docs/vendor-onboarding/buildkite.md` | Vendor setup guide | +| `docs/docs/vendor-onboarding/testing-guide.md` | Testing instructions | +| `tests/test_buildkite.py` | Unit tests | + +### Files Modified + +| File | Changes | +|------|---------| +| `src/libkernelbot/consts.py` | Added `BuildkiteGPU` enum, `BUILDKITE` scheduler | +| `src/libkernelbot/launchers/__init__.py` | Export `BuildkiteLauncher` | +| `src/kernelbot/env.py` | Buildkite env vars | +| `src/kernelbot/main.py` | Register launcher if token set | + +### Key Code + +**BuildkiteLauncher** (`src/libkernelbot/launchers/buildkite.py`): +```python +class BuildkiteLauncher(Launcher): + def __init__(self, org: str, pipeline: str, token: str): + super().__init__(name="Buildkite", gpus=BuildkiteGPU) + # ... + + async def run_submission(self, config, gpu_type, status) -> FullResult: + # 1. Compress config (zlib + base64) + # 2. Create build via Buildkite API + # 3. Poll for completion + # 4. Download artifacts + # 5. Parse result.json -> FullResult +``` + +**Agent Setup** (`scripts/buildkite/setup-agent.sh`): +```bash +# Creates per-GPU systemd service with: +Environment="CUDA_VISIBLE_DEVICES=${GPU_INDEX}" +Environment="BUILDKITE_AGENT_TAGS=queue=${QUEUE_NAME}" +Slice=buildkite-gpu${GPU_INDEX}.slice # cgroup isolation +``` + +--- + +## Testing + +### Unit Tests (All Passing) + +```bash +uv run pytest tests/test_buildkite.py -v +``` + +``` +tests/test_buildkite.py::TestBuildkiteGPU::test_enum_values PASSED +tests/test_buildkite.py::TestBuildkiteGPU::test_scheduler_type_exists PASSED +tests/test_buildkite.py::TestBuildkiteGPU::test_gpu_lookup PASSED +tests/test_buildkite.py::TestBuildkiteLauncher::test_init PASSED +tests/test_buildkite.py::TestBuildkiteLauncher::test_headers PASSED +tests/test_buildkite.py::TestBuildkiteLauncher::test_payload_compression PASSED +tests/test_buildkite.py::TestBuildkiteLauncher::test_run_submission_creates_build PASSED +tests/test_buildkite.py::TestBuildkiteLauncher::test_run_submission_handles_api_error PASSED +tests/test_buildkite.py::TestBuildkiteLauncher::test_status_updates PASSED +tests/test_buildkite.py::TestBuildkiteRunner::test_runner_script_syntax PASSED + +============================== 10 passed ============================== +``` + +### Import/Integration Tests + +```bash +# Verify imports work +uv run python -c " +from libkernelbot.launchers import BuildkiteLauncher +from libkernelbot.consts import BuildkiteGPU, get_gpu_by_name + +launcher = BuildkiteLauncher(org='test', pipeline='test', token='fake') +print(f'Launcher: {launcher.name}') +print(f'GPUs: {[g.value for g in BuildkiteGPU]}') +" +``` + +Output: +``` +Launcher: Buildkite +GPUs: ['nvidia-h100', 'nvidia-b200', 'nvidia-a100', 'amd-mi300', 'amd-mi250', 'google-tpu', 'nebius-h100'] +``` + +### Local Container Test (For Vendors) + +```bash +# Build image +docker build -t kernelbot-runner:test -f 
docker/kernelbot-runner/Dockerfile . + +# Test with single GPU +docker run --rm --gpus '"device=0"' \ + -e SUBMISSION_PAYLOAD="$(python3 -c ' +import json, zlib, base64 +config = {"lang": "py", "mode": "test", "files": {"main.py": "import torch; print(torch.cuda.get_device_name(0))"}, "tests": [], "benchmarks": [], "test_timeout": 60, "benchmark_timeout": 60, "ranked_timeout": 60} +print(base64.b64encode(zlib.compress(json.dumps(config).encode())).decode()) +')" \ + kernelbot-runner:test +``` + +--- + +## Vendor Onboarding Flow + +### For Vendors + +1. **Get Buildkite token** from Kernelbot team +2. **Clone repo**: `git clone https://github.com/gpu-mode/kernelbot.git` +3. **Pull image**: `docker pull ghcr.io/gpu-mode/kernelbot-runner:latest` +4. **Run setup script** for each GPU: + ```bash + sudo ./scripts/buildkite/setup-agent.sh 0 nvidia-h100-0 32G 16 + sudo ./scripts/buildkite/setup-agent.sh 1 nvidia-h100-1 32G 16 + # ... for all GPUs + ``` +5. **Set token**: Edit `/etc/buildkite-agent/token` +6. **Start agents**: `sudo systemctl start 'buildkite-agent-gpu*'` +7. **Verify**: Check Buildkite dashboard for connected agents + +### For Kernelbot Team + +1. Set env vars: + ```bash + BUILDKITE_API_TOKEN=bkua_xxxxx + BUILDKITE_ORG=gpu-mode + BUILDKITE_PIPELINE=kernelbot-runner + ``` +2. Launcher auto-registers if token is set +3. Add GPU types to leaderboard configs + +--- + +## Next Steps + +### Immediate (For Integration Testing) + +- [ ] Create Buildkite organization and pipeline +- [ ] Set up 1 test agent on a GPU machine +- [ ] Run end-to-end test via API +- [ ] Compare benchmark results with Modal + +### Before Production + +- [ ] Set up GitHub Container Registry for image +- [ ] Configure Slack webhook for vendor notifications +- [ ] Test with multiple concurrent jobs +- [ ] Document SLA expectations for vendors + +### Future Enhancements + +- [ ] Webhook-based completion (instead of polling) +- [ ] Agent health monitoring dashboard +- [ ] Automatic image version checking +- [ ] Support for non-NVIDIA GPUs (TPU, AMD) + +--- + +## Configuration Reference + +### Environment Variables + +| Variable | Required | Default | Description | +|----------|----------|---------|-------------| +| `BUILDKITE_API_TOKEN` | Yes | - | Buildkite API token | +| `BUILDKITE_ORG` | No | `gpu-mode` | Buildkite org slug | +| `BUILDKITE_PIPELINE` | No | `kernelbot-runner` | Pipeline slug | + +### GPU Types (BuildkiteGPU Enum) + +| Enum Name | Queue Value | Description | +|-----------|-------------|-------------| +| `NVIDIA_H100` | `nvidia-h100` | NVIDIA H100 | +| `NVIDIA_B200` | `nvidia-b200` | NVIDIA B200 | +| `NVIDIA_A100` | `nvidia-a100` | NVIDIA A100 | +| `AMD_MI300` | `amd-mi300` | AMD MI300 | +| `AMD_MI250` | `amd-mi250` | AMD MI250 | +| `GOOGLE_TPU` | `google-tpu` | Google TPU | +| `NEBIUS_H100` | `nebius-h100` | Nebius H100 | + +--- + +## Files Reference + +``` +kernelbot/ +├── .buildkite/ +│ └── pipeline.yml # Buildkite pipeline +├── .github/workflows/ +│ └── build-runner-image.yml # Auto-build Docker image +├── docker/kernelbot-runner/ +│ ├── Dockerfile # Runner container +│ └── requirements-runner.txt # Python deps +├── docs/docs/vendor-onboarding/ +│ ├── buildkite.md # Vendor setup guide +│ └── testing-guide.md # Testing instructions +├── scripts/buildkite/ +│ ├── setup-agent.sh # Agent bootstrap +│ └── update-image.sh # Image updater +├── src/ +│ ├── kernelbot/ +│ │ ├── env.py # +Buildkite env vars +│ │ └── main.py # +Register launcher +│ ├── libkernelbot/ +│ │ ├── consts.py # +BuildkiteGPU enum +│ │ 
└── launchers/ +│ │ ├── __init__.py # +Export +│ │ └── buildkite.py # BuildkiteLauncher +│ └── runners/ +│ └── buildkite-runner.py # Runner script +└── tests/ + └── test_buildkite.py # Unit tests +``` + +--- + +## Contact + +- **Implementation**: [Your name] +- **Questions**: #kernelbot-infra on Discord +- **Issues**: https://github.com/gpu-mode/kernelbot/issues diff --git a/docs/docs/vendor-onboarding/buildkite.md b/docs/docs/vendor-onboarding/buildkite.md new file mode 100644 index 00000000..8250e9c4 --- /dev/null +++ b/docs/docs/vendor-onboarding/buildkite.md @@ -0,0 +1,243 @@ +--- +sidebar_position: 1 +--- + +# Buildkite Vendor Onboarding + +This guide explains how to set up Buildkite agents on your hardware to run GPU kernel competitions for Kernelbot. + +## Overview + +Kernelbot uses Buildkite to run GPU kernel submissions on vendor-donated hardware. Each GPU on your machine runs as an isolated Buildkite agent with: + +- **GPU Isolation**: Single GPU per agent via `CUDA_VISIBLE_DEVICES` +- **CPU/RAM Limits**: Resource constraints via systemd cgroups +- **Queue Routing**: Jobs routed to specific GPUs via queue tags + +## Prerequisites + +Before setting up agents, ensure you have: + +1. **Linux server** with NVIDIA GPUs (Ubuntu 22.04+ recommended) +2. **Docker** installed with [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) +3. **Buildkite agent** installed ([installation guide](https://buildkite.com/docs/agent/v3/installation)) +4. **Buildkite organization token** from the Kernelbot team + +### Verify Prerequisites + +```bash +# Check NVIDIA driver +nvidia-smi + +# Check Docker with GPU support +docker run --rm --gpus all nvidia/cuda:13.1.0-base-ubuntu24.04 nvidia-smi + +# Check Buildkite agent +buildkite-agent --version +``` + +## Queue Naming Convention + +Queues follow the pattern: `{vendor}-{gpu_type}-{index}` + +Examples: +- `nvidia-h100-0` - NVIDIA-donated H100, GPU index 0 +- `nvidia-h100-1` - NVIDIA-donated H100, GPU index 1 +- `amd-mi300-0` - AMD-donated MI300 +- `google-tpu-0` - Google-donated TPU +- `nebius-h100-0` - Nebius-donated H100 + +Contact the Kernelbot team to register your queue names. + +## Setup Instructions + +### Step 1: Clone the Repository + +```bash +git clone https://github.com/gpu-mode/kernelbot.git +cd kernelbot +``` + +### Step 2: Pull the Runner Image + +```bash +docker pull ghcr.io/gpu-mode/kernelbot-runner:latest +``` + +### Step 3: Configure Agents + +Run the setup script for each GPU. For an 8-GPU node: + +```bash +# Set your Buildkite token +echo "BUILDKITE_AGENT_TOKEN=your-token-here" | sudo tee /etc/buildkite-agent/token + +# Set up each GPU (adjust queue names for your vendor) +sudo ./scripts/buildkite/setup-agent.sh 0 nvidia-h100-0 32G 16 +sudo ./scripts/buildkite/setup-agent.sh 1 nvidia-h100-1 32G 16 +sudo ./scripts/buildkite/setup-agent.sh 2 nvidia-h100-2 32G 16 +# ... repeat for all GPUs +``` + +Arguments: +- `GPU_INDEX`: GPU device index (0, 1, 2, ...) +- `QUEUE_NAME`: Queue name following convention above +- `MEMORY_LIMIT`: RAM limit per agent (default: 32G) +- `CPU_CORES`: CPU cores per agent (default: 16) + +### Step 4: Start Agents + +```bash +# Start all GPU agents +sudo systemctl start buildkite-agent-gpu0 +sudo systemctl start buildkite-agent-gpu1 +# ... 
etc + +# Or start all at once +sudo systemctl start 'buildkite-agent-gpu*' +``` + +### Step 5: Verify Setup + +```bash +# Check agent status +sudo systemctl status buildkite-agent-gpu0 + +# View logs +sudo journalctl -u buildkite-agent-gpu0 -f + +# Verify agent appears in Buildkite dashboard +# https://buildkite.com/organizations//agents +``` + +## Testing Your Setup + +### Local Test (Without Buildkite) + +Test the runner image directly: + +```bash +# Create a test payload +TEST_PAYLOAD=$(python3 -c " +import json, zlib, base64 +config = { + 'lang': 'py', + 'mode': 'test', + 'files': {'main.py': 'print(\"Hello GPU!\")'}, + 'tests': [], + 'benchmarks': [] +} +print(base64.b64encode(zlib.compress(json.dumps(config).encode())).decode()) +") + +# Run in container (single GPU) +docker run --rm --gpus '"device=0"' \ + -e SUBMISSION_PAYLOAD="$TEST_PAYLOAD" \ + ghcr.io/gpu-mode/kernelbot-runner:latest + +# Check if result.json would be created +ls -la result.json +``` + +### Integration Test (Via Buildkite) + +Trigger a test build: + +```bash +curl -X POST "https://api.buildkite.com/v2/organizations/gpu-mode/pipelines/kernelbot-runner/builds" \ + -H "Authorization: Bearer $BUILDKITE_API_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "commit": "HEAD", + "branch": "main", + "message": "Test submission", + "env": { + "GPU_QUEUE": "your-queue-name", + "SUBMISSION_PAYLOAD": "'"$TEST_PAYLOAD"'" + } + }' +``` + +Check the Buildkite dashboard for job results. + +### Isolation Verification + +Verify GPU and resource isolation: + +```bash +# Inside agent container, verify only 1 GPU visible +docker run --rm --gpus '"device=0"' nvidia/cuda:13.1.0-base-ubuntu24.04 nvidia-smi +# Should show only GPU 0 + +# Verify cgroup limits +cat /sys/fs/cgroup/buildkite-gpu0.slice/memory.max +cat /sys/fs/cgroup/buildkite-gpu0.slice/cpu.max +``` + +## Updating the Runner Image + +When notified of a new image release: + +```bash +sudo ./scripts/buildkite/update-image.sh +``` + +This pulls the latest image and optionally restarts agents. + +### Automatic Updates (Optional) + +Set up a cron job for automatic updates: + +```bash +# Check for updates daily at 3 AM +echo "0 3 * * * root /path/to/kernelbot/scripts/buildkite/update-image.sh --auto" | sudo tee /etc/cron.d/kernelbot-update +``` + +## Troubleshooting + +### Agent Not Picking Up Jobs + +1. Check agent is running: `systemctl status buildkite-agent-gpu0` +2. Verify queue tag matches: check `/etc/buildkite-agent/agent-0/buildkite-agent.cfg` +3. Ensure agent appears in Buildkite dashboard + +### GPU Not Visible in Container + +1. Check NVIDIA Container Toolkit: `docker run --rm --gpus all nvidia/cuda:13.1.0-base-ubuntu24.04 nvidia-smi` +2. Verify CUDA_VISIBLE_DEVICES is set correctly in systemd unit +3. Check Docker runtime config: `docker info | grep -i runtime` + +### Jobs Timing Out + +1. Check resource limits aren't too restrictive +2. Review job logs in Buildkite dashboard +3. Test image locally first + +### Memory/CPU Limits Not Working + +1. Verify cgroup v2 is enabled: `mount | grep cgroup2` +2. Check slice file exists: `cat /etc/systemd/system/buildkite-gpu0.slice` +3. 
Reload systemd: `systemctl daemon-reload` + +## Support + +- **Slack**: #kernelbot-infra in GPU Mode Discord +- **Issues**: https://github.com/gpu-mode/kernelbot/issues +- **Email**: infra@gpu-mode.org + +## Hardware Requirements + +Per GPU agent: + +| Resource | Minimum | Recommended | +|----------|---------|-------------| +| RAM | 16 GB | 32 GB | +| CPU Cores | 8 | 16 | +| Disk | 50 GB | 100 GB | +| Network | 100 Mbps | 1 Gbps | + +For an 8-GPU node, plan for: +- 256 GB RAM (32 GB per GPU) +- 128 CPU cores (16 per GPU) +- 800 GB disk +- Fast network for image pulls diff --git a/docs/docs/vendor-onboarding/testing-guide.md b/docs/docs/vendor-onboarding/testing-guide.md new file mode 100644 index 00000000..4cc1c703 --- /dev/null +++ b/docs/docs/vendor-onboarding/testing-guide.md @@ -0,0 +1,519 @@ +--- +sidebar_position: 2 +--- + +# Buildkite Testing Guide + +This guide covers how to test the Buildkite integration at various levels: local development, vendor validation, and end-to-end integration. + +## Testing Levels + +| Level | Who | Purpose | +|-------|-----|---------| +| Unit Tests | Kernelbot developers | Test launcher logic with mocked API | +| Local Container | Vendors | Verify runner image works with GPU | +| Agent Integration | Vendors | Verify agent picks up and runs jobs | +| End-to-End | Both | Full submission flow through Discord/API | + +--- + +## 1. Unit Tests (Kernelbot Developers) + +### Test BuildkiteLauncher with Mocked API + +```python +# tests/test_buildkite_launcher.py +import pytest +from unittest.mock import AsyncMock, patch, MagicMock +import json +import base64 +import zlib + +from libkernelbot.launchers.buildkite import BuildkiteLauncher +from libkernelbot.consts import BuildkiteGPU, GPU + + +@pytest.fixture +def launcher(): + return BuildkiteLauncher( + org="test-org", + pipeline="test-pipeline", + token="test-token" + ) + + +@pytest.fixture +def mock_config(): + return { + "lang": "py", + "mode": "test", + "files": {"main.py": "print('hello')"}, + "tests": [], + "benchmarks": [], + "test_timeout": 180, + } + + +@pytest.fixture +def gpu_type(): + return GPU(name="NVIDIA_H100", value="nvidia-h100", runner="Buildkite") + + +class TestBuildkiteLauncher: + def test_init(self, launcher): + assert launcher.name == "Buildkite" + assert launcher.org == "test-org" + assert launcher.pipeline == "test-pipeline" + assert launcher.gpus == BuildkiteGPU + + @pytest.mark.asyncio + async def test_run_submission_creates_build(self, launcher, mock_config, gpu_type): + """Test that run_submission creates a Buildkite build.""" + mock_response = MagicMock() + mock_response.json.return_value = { + "number": 123, + "web_url": "https://buildkite.com/test/builds/123", + "state": "scheduled", + } + mock_response.raise_for_status = MagicMock() + + with patch("requests.post", return_value=mock_response) as mock_post: + with patch.object(launcher, "_wait_for_completion", new_callable=AsyncMock): + with patch.object(launcher, "_download_and_parse_result", new_callable=AsyncMock) as mock_download: + mock_download.return_value = MagicMock(success=True) + + mock_status = AsyncMock() + result = await launcher.run_submission(mock_config, gpu_type, mock_status) + + # Verify API was called + mock_post.assert_called_once() + call_args = mock_post.call_args + + # Check URL + assert "test-org" in call_args[0][0] + assert "test-pipeline" in call_args[0][0] + + # Check payload was compressed + body = call_args[1]["json"] + assert "SUBMISSION_PAYLOAD" in body["env"] + assert body["env"]["GPU_QUEUE"] == 
"nvidia-h100" + + @pytest.mark.asyncio + async def test_payload_compression(self, launcher, mock_config, gpu_type): + """Test that config is properly compressed.""" + captured_payload = None + + def capture_post(*args, **kwargs): + nonlocal captured_payload + captured_payload = kwargs["json"]["env"]["SUBMISSION_PAYLOAD"] + response = MagicMock() + response.json.return_value = {"number": 1, "web_url": "http://test", "state": "scheduled"} + response.raise_for_status = MagicMock() + return response + + with patch("requests.post", side_effect=capture_post): + with patch.object(launcher, "_wait_for_completion", new_callable=AsyncMock): + with patch.object(launcher, "_download_and_parse_result", new_callable=AsyncMock): + mock_status = AsyncMock() + await launcher.run_submission(mock_config, gpu_type, mock_status) + + # Decompress and verify + decompressed = zlib.decompress(base64.b64decode(captured_payload)).decode() + parsed = json.loads(decompressed) + assert parsed["lang"] == "py" + assert parsed["mode"] == "test" +``` + +### Run Unit Tests + +```bash +pytest tests/test_buildkite_launcher.py -v +``` + +--- + +## 2. Local Container Tests (Vendors) + +### 2.1 Basic Image Test + +Verify the image runs and has correct dependencies: + +```bash +# Pull the image +docker pull ghcr.io/gpu-mode/kernelbot-runner:latest + +# Check Python and dependencies +docker run --rm ghcr.io/gpu-mode/kernelbot-runner:latest python --version +docker run --rm ghcr.io/gpu-mode/kernelbot-runner:latest pip list | grep torch + +# Check GPU access +docker run --rm --gpus all ghcr.io/gpu-mode/kernelbot-runner:latest nvidia-smi +``` + +### 2.2 Single GPU Isolation Test + +Verify only the specified GPU is visible: + +```bash +# Should only show GPU 0 +docker run --rm --gpus '"device=0"' ghcr.io/gpu-mode/kernelbot-runner:latest nvidia-smi + +# Should only show GPU 1 +docker run --rm --gpus '"device=1"' ghcr.io/gpu-mode/kernelbot-runner:latest nvidia-smi +``` + +### 2.3 Runner Script Test + +Test the runner with a simple payload: + +```bash +# Create test payload +create_test_payload() { + python3 -c " +import json, zlib, base64 +config = { + 'lang': 'py', + 'mode': 'test', + 'files': { + 'main.py': ''' +import torch +print(f\"PyTorch version: {torch.__version__}\") +print(f\"CUDA available: {torch.cuda.is_available()}\") +if torch.cuda.is_available(): + print(f\"GPU: {torch.cuda.get_device_name(0)}\") + print(f\"GPU count: {torch.cuda.device_count()}\") +''' + }, + 'tests': [], + 'benchmarks': [], + 'test_timeout': 60, + 'benchmark_timeout': 60, + 'ranked_timeout': 60, +} +print(base64.b64encode(zlib.compress(json.dumps(config).encode())).decode()) +" +} + +# Run with payload +docker run --rm --gpus '"device=0"' \ + -e SUBMISSION_PAYLOAD="$(create_test_payload)" \ + -v "$(pwd)/test-output:/workdir" \ + -w /workdir \ + ghcr.io/gpu-mode/kernelbot-runner:latest + +# Check output +cat test-output/result.json | jq . 
+``` + +### 2.4 CUDA Kernel Test + +Test a simple CUDA kernel submission: + +```bash +create_cuda_payload() { + python3 -c " +import json, zlib, base64 +config = { + 'lang': 'py', + 'mode': 'test', + 'files': { + 'main.py': ''' +import torch +import torch.nn as nn + +# Simple GPU operation +x = torch.randn(1000, 1000, device=\"cuda\") +y = torch.randn(1000, 1000, device=\"cuda\") +z = torch.matmul(x, y) +print(f\"Matrix multiply result shape: {z.shape}\") +print(f\"Result sum: {z.sum().item():.2f}\") +''' + }, + 'tests': [], + 'benchmarks': [], + 'test_timeout': 60, + 'benchmark_timeout': 60, + 'ranked_timeout': 60, +} +print(base64.b64encode(zlib.compress(json.dumps(config).encode())).decode()) +" +} + +docker run --rm --gpus '"device=0"' \ + -e SUBMISSION_PAYLOAD="$(create_cuda_payload)" \ + -v "$(pwd)/test-output:/workdir" \ + -w /workdir \ + ghcr.io/gpu-mode/kernelbot-runner:latest +``` + +### 2.5 Resource Limit Test + +Test memory limits are enforced: + +```bash +# Run with memory limit +docker run --rm --gpus '"device=0"' \ + --memory=4g \ + -e SUBMISSION_PAYLOAD="$(create_test_payload)" \ + ghcr.io/gpu-mode/kernelbot-runner:latest + +# Check container saw the limit +docker run --rm --memory=4g ghcr.io/gpu-mode/kernelbot-runner:latest \ + cat /sys/fs/cgroup/memory.max +``` + +--- + +## 3. Agent Integration Tests (Vendors) + +### 3.1 Agent Health Check + +After setting up agents, verify they're healthy: + +```bash +# Check systemd service status +sudo systemctl status buildkite-agent-gpu0 +sudo systemctl status buildkite-agent-gpu1 + +# Check agent logs +sudo journalctl -u buildkite-agent-gpu0 --since "5 minutes ago" + +# Verify agent appears in Buildkite dashboard +curl -s -H "Authorization: Bearer $BUILDKITE_API_TOKEN" \ + "https://api.buildkite.com/v2/organizations/gpu-mode/agents" | jq '.[] | {name, connection_state, metadata}' +``` + +### 3.2 Cgroup Isolation Verification + +Verify resource isolation is working: + +```bash +# Check memory limit +cat /sys/fs/cgroup/buildkite-gpu0.slice/memory.max +# Should show your configured limit (e.g., 34359738368 for 32G) + +# Check CPU quota +cat /sys/fs/cgroup/buildkite-gpu0.slice/cpu.max +# Should show something like "1600000 100000" for 16 cores + +# Verify agent is in the slice +systemctl status buildkite-agent-gpu0 | grep "CGroup" +``` + +### 3.3 GPU Binding Verification + +Verify each agent only sees its assigned GPU: + +```bash +# Check what GPU agent 0 sees +sudo -u buildkite CUDA_VISIBLE_DEVICES=0 nvidia-smi -L +# Should show only GPU 0 + +# Check what GPU agent 1 sees +sudo -u buildkite CUDA_VISIBLE_DEVICES=1 nvidia-smi -L +# Should show only GPU 1 +``` + +### 3.4 Trigger Test Build + +Trigger a test build and verify it runs on correct agent: + +```bash +# Create a test payload +TEST_PAYLOAD=$(python3 -c " +import json, zlib, base64 +config = { + 'lang': 'py', + 'mode': 'test', + 'files': {'main.py': 'import torch; print(torch.cuda.get_device_name(0))'}, + 'tests': [], + 'benchmarks': [], + 'test_timeout': 60, + 'benchmark_timeout': 60, + 'ranked_timeout': 60, +} +print(base64.b64encode(zlib.compress(json.dumps(config).encode())).decode()) +") + +# Trigger build on specific queue +curl -X POST "https://api.buildkite.com/v2/organizations/gpu-mode/pipelines/kernelbot-runner/builds" \ + -H "Authorization: Bearer $BUILDKITE_API_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "commit": "HEAD", + "branch": "main", + "message": "Agent integration test", + "env": { + "GPU_QUEUE": "nvidia-h100-0", + "SUBMISSION_PAYLOAD": 
"'"$TEST_PAYLOAD"'" + } + }' | jq '{number, web_url, state}' + +# Watch the build +# Check Buildkite dashboard or poll API +``` + +### 3.5 Concurrent Job Test + +Verify jobs don't interfere with each other: + +```bash +# Trigger jobs on different GPUs simultaneously +for i in 0 1 2 3; do + curl -X POST "https://api.buildkite.com/v2/organizations/gpu-mode/pipelines/kernelbot-runner/builds" \ + -H "Authorization: Bearer $BUILDKITE_API_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "commit": "HEAD", + "branch": "main", + "message": "Concurrent test GPU '"$i"'", + "env": { + "GPU_QUEUE": "nvidia-h100-'"$i"'", + "SUBMISSION_PAYLOAD": "'"$TEST_PAYLOAD"'" + } + }' & +done +wait + +# All 4 should run in parallel on different agents +``` + +--- + +## 4. End-to-End Tests (Full System) + +### 4.1 API Submission Test + +Test the full flow through Kernelbot's API: + +```bash +# This requires the full Kernelbot stack running +# Submit via API endpoint +curl -X POST "http://localhost:8000/leaderboard/test-leaderboard/nvidia-h100/test" \ + -H "Content-Type: application/json" \ + -d '{ + "code": "import torch; print(torch.cuda.get_device_name(0))", + "user_id": "test-user", + "user_name": "Test User" + }' +``` + +### 4.2 Discord Bot Test + +Test submission via Discord (manual): + +1. Go to the Discord server with Kernelbot +2. Use `/leaderboard submit test` command +3. Select a Buildkite GPU type (e.g., `nvidia-h100`) +4. Upload a test script +5. Verify the submission runs and returns results + +### 4.3 Benchmark Accuracy Test + +Compare results between launchers: + +```bash +# Run same benchmark on Modal and Buildkite +# Results should be within acceptable variance (< 5% for microbenchmarks) + +# This requires a benchmark that runs on both +# Compare the timing results in the database +``` + +--- + +## 5. 
Troubleshooting Tests + +### 5.1 Timeout Behavior + +Test that timeouts work correctly: + +```bash +# Create a payload that times out +TIMEOUT_PAYLOAD=$(python3 -c " +import json, zlib, base64 +config = { + 'lang': 'py', + 'mode': 'test', + 'files': {'main.py': 'import time; time.sleep(300)'}, # 5 minutes + 'tests': [], + 'benchmarks': [], + 'test_timeout': 10, # 10 second timeout + 'benchmark_timeout': 10, + 'ranked_timeout': 10, +} +print(base64.b64encode(zlib.compress(json.dumps(config).encode())).decode()) +") + +# Should timeout after ~10 seconds +docker run --rm --gpus '"device=0"' \ + -e SUBMISSION_PAYLOAD="$TIMEOUT_PAYLOAD" \ + ghcr.io/gpu-mode/kernelbot-runner:latest +``` + +### 5.2 Error Handling + +Test error cases: + +```bash +# Missing GPU +docker run --rm \ + -e SUBMISSION_PAYLOAD="$TEST_PAYLOAD" \ + ghcr.io/gpu-mode/kernelbot-runner:latest +# Should fail gracefully with error in result.json + +# Invalid payload +docker run --rm --gpus '"device=0"' \ + -e SUBMISSION_PAYLOAD="not-valid-base64" \ + ghcr.io/gpu-mode/kernelbot-runner:latest +# Should fail with clear error message + +# Missing payload +docker run --rm --gpus '"device=0"' \ + ghcr.io/gpu-mode/kernelbot-runner:latest +# Should fail with "SUBMISSION_PAYLOAD not set" error +``` + +### 5.3 Agent Recovery + +Test agent recovers from failures: + +```bash +# Kill the agent process +sudo systemctl kill -s SIGKILL buildkite-agent-gpu0 + +# Check it restarts automatically +sleep 5 +sudo systemctl status buildkite-agent-gpu0 +# Should show "active (running)" +``` + +--- + +## Test Checklist + +Use this checklist before going live: + +### Vendor Checklist + +- [ ] Image pulls successfully +- [ ] Image runs with GPU access +- [ ] Single GPU isolation works +- [ ] Runner script executes test payload +- [ ] CUDA operations work in container +- [ ] All agents show as connected in Buildkite +- [ ] Cgroup limits are enforced +- [ ] Test build completes successfully +- [ ] Artifacts are uploaded correctly +- [ ] Agent restarts after failure + +### Developer Checklist + +- [ ] Unit tests pass +- [ ] BuildkiteLauncher creates builds +- [ ] Polling works correctly +- [ ] Artifacts are downloaded and parsed +- [ ] Timeouts are handled +- [ ] Errors return proper FullResult +- [ ] GPU enum is registered correctly +- [ ] Launcher is registered in main.py diff --git a/scripts/buildkite/setup-agent.sh b/scripts/buildkite/setup-agent.sh new file mode 100755 index 00000000..9f6e4821 --- /dev/null +++ b/scripts/buildkite/setup-agent.sh @@ -0,0 +1,157 @@ +#!/bin/bash +# +# Buildkite Agent Setup Script for Kernelbot +# +# This script configures a Buildkite agent for a single GPU with proper isolation. +# Each GPU on the node should have its own agent with dedicated resources. +# +# Usage: +# sudo ./setup-agent.sh [memory_limit] [cpu_cores] +# +# Examples: +# sudo ./setup-agent.sh 0 nvidia-h100-0 32G 16 +# sudo ./setup-agent.sh 1 nvidia-h100-1 32G 16 +# +# Prerequisites: +# - Buildkite agent installed: https://buildkite.com/docs/agent/v3/installation +# - Docker installed with NVIDIA runtime +# - BUILDKITE_AGENT_TOKEN set in environment or passed via config +# +# What this script does: +# 1. Creates a systemd service for the agent bound to specific GPU +# 2. Creates a cgroup slice for CPU/RAM isolation +# 3. 
Configures agent with queue tags for job routing +# + +set -euo pipefail + +# Parse arguments +GPU_INDEX="${1:?GPU index required (e.g., 0, 1, 2...)}" +QUEUE_NAME="${2:?Queue name required (e.g., nvidia-h100-0)}" +MEMORY_LIMIT="${3:-32G}" +CPU_CORES="${4:-16}" + +# Validate GPU exists +if ! nvidia-smi -i "$GPU_INDEX" &>/dev/null; then + echo "Error: GPU $GPU_INDEX not found" + nvidia-smi -L + exit 1 +fi + +echo "Setting up Buildkite agent for GPU $GPU_INDEX with queue $QUEUE_NAME" +echo " Memory limit: $MEMORY_LIMIT" +echo " CPU cores: $CPU_CORES" + +# Create buildkite user if it doesn't exist +if ! id buildkite &>/dev/null; then + echo "Creating buildkite user..." + useradd -r -m -s /bin/bash buildkite + usermod -aG docker buildkite +fi + +# Create agent config directory +AGENT_CONFIG_DIR="/etc/buildkite-agent/agent-${GPU_INDEX}" +mkdir -p "$AGENT_CONFIG_DIR" + +# Create agent configuration file +cat > "$AGENT_CONFIG_DIR/buildkite-agent.cfg" << EOF +# Buildkite Agent Configuration for GPU $GPU_INDEX +# Auto-generated by setup-agent.sh + +name="gpu-${GPU_INDEX}-%hostname-%n" +tags="queue=${QUEUE_NAME}" +build-path="/var/lib/buildkite-agent/builds-gpu${GPU_INDEX}" + +# Hooks directory (optional) +hooks-path="/etc/buildkite-agent/hooks" + +# Plugins directory +plugins-path="/var/lib/buildkite-agent/plugins" + +# Disconnect after job (for clean state) +disconnect-after-job=false +disconnect-after-idle-timeout=0 + +# Enable job log timestamps +timestamp-lines=true +EOF + +# Create build directory +mkdir -p "/var/lib/buildkite-agent/builds-gpu${GPU_INDEX}" +chown -R buildkite:buildkite "/var/lib/buildkite-agent/builds-gpu${GPU_INDEX}" + +# Create cgroup slice for resource isolation +cat > "/etc/systemd/system/buildkite-gpu${GPU_INDEX}.slice" << EOF +[Unit] +Description=Buildkite Agent Slice for GPU ${GPU_INDEX} +Before=slices.target + +[Slice] +MemoryMax=${MEMORY_LIMIT} +CPUQuota=$((CPU_CORES * 100))% +EOF + +# Create systemd service for this GPU +cat > "/etc/systemd/system/buildkite-agent-gpu${GPU_INDEX}.service" << EOF +[Unit] +Description=Buildkite Agent for GPU ${GPU_INDEX} (${QUEUE_NAME}) +Documentation=https://buildkite.com/docs/agent/v3 +After=network.target docker.service +Requires=docker.service + +[Service] +Type=simple +User=buildkite +Group=buildkite + +# GPU isolation - only this GPU is visible +Environment="CUDA_VISIBLE_DEVICES=${GPU_INDEX}" + +# Pass agent token (should be set in environment file) +EnvironmentFile=-/etc/buildkite-agent/token + +# Use agent-specific config +ExecStart=/usr/bin/buildkite-agent start --config ${AGENT_CONFIG_DIR}/buildkite-agent.cfg + +# Restart on failure +Restart=always +RestartSec=5 + +# Resource isolation via cgroup slice +Slice=buildkite-gpu${GPU_INDEX}.slice + +# Hardening +NoNewPrivileges=false +ProtectSystem=full +ProtectHome=read-only + +[Install] +WantedBy=multi-user.target +EOF + +# Create environment file for token if it doesn't exist +if [[ ! -f /etc/buildkite-agent/token ]]; then + cat > /etc/buildkite-agent/token << EOF +# Buildkite agent token - set this to your organization's token +# Get it from: https://buildkite.com/organizations//agents +BUILDKITE_AGENT_TOKEN= +EOF + chmod 600 /etc/buildkite-agent/token + echo "" + echo "⚠️ IMPORTANT: Set your Buildkite agent token in /etc/buildkite-agent/token" +fi + +# Reload systemd and enable the service +systemctl daemon-reload +systemctl enable "buildkite-agent-gpu${GPU_INDEX}.service" + +echo "" +echo "✅ Buildkite agent for GPU $GPU_INDEX configured successfully!" 
+echo "" +echo "Next steps:" +echo " 1. Set BUILDKITE_AGENT_TOKEN in /etc/buildkite-agent/token" +echo " 2. Start the agent: sudo systemctl start buildkite-agent-gpu${GPU_INDEX}" +echo " 3. Check status: sudo systemctl status buildkite-agent-gpu${GPU_INDEX}" +echo " 4. View logs: sudo journalctl -u buildkite-agent-gpu${GPU_INDEX} -f" +echo "" +echo "To set up additional GPUs, run this script with different GPU indices." diff --git a/scripts/buildkite/update-image.sh b/scripts/buildkite/update-image.sh new file mode 100755 index 00000000..402a91e1 --- /dev/null +++ b/scripts/buildkite/update-image.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# +# Update Kernelbot Runner Image +# +# This script pulls the latest runner image and restarts agents to use it. +# Run this when notified of a new image release. +# +# Usage: +# sudo ./update-image.sh +# + +set -euo pipefail + +IMAGE="ghcr.io/gpu-mode/kernelbot-runner:latest" + +echo "Pulling latest kernelbot runner image..." +docker pull "$IMAGE" + +echo "" +echo "Image updated. Checking for running agents..." + +# Find all buildkite-agent-gpu* services +AGENTS=$(systemctl list-units --type=service --state=running --no-legend | grep 'buildkite-agent-gpu' | awk '{print $1}' || true) + +if [[ -z "$AGENTS" ]]; then + echo "No running Buildkite GPU agents found." + echo "Image will be used on next job run." +else + echo "Found running agents:" + echo "$AGENTS" + echo "" + read -p "Restart agents to use new image? (y/N) " -n 1 -r + echo "" + if [[ $REPLY =~ ^[Yy]$ ]]; then + for agent in $AGENTS; do + echo "Restarting $agent..." + systemctl restart "$agent" + done + echo "" + echo "✅ All agents restarted with new image." + else + echo "Agents will use new image on next job run." + fi +fi + +echo "" +echo "Current image info:" +docker inspect "$IMAGE" --format='ID: {{.Id}}' +docker inspect "$IMAGE" --format='Created: {{.Created}}' +docker inspect "$IMAGE" --format='Labels: {{json .Config.Labels}}' diff --git a/src/kernelbot/env.py b/src/kernelbot/env.py index b1758b63..4e0e3bbc 100644 --- a/src/kernelbot/env.py +++ b/src/kernelbot/env.py @@ -47,3 +47,8 @@ def init_environment(): # PostgreSQL-specific constants env.DATABASE_URL = os.getenv("DATABASE_URL") env.DISABLE_SSL = os.getenv("DISABLE_SSL") + +# Buildkite-specific constants (optional - for vendor-managed GPU runners) +env.BUILDKITE_API_TOKEN = os.getenv("BUILDKITE_API_TOKEN") +env.BUILDKITE_ORG = os.getenv("BUILDKITE_ORG", "gpu-mode") +env.BUILDKITE_PIPELINE = os.getenv("BUILDKITE_PIPELINE", "kernelbot-runner") diff --git a/src/kernelbot/main.py b/src/kernelbot/main.py index e0411096..2aff46d2 100644 --- a/src/kernelbot/main.py +++ b/src/kernelbot/main.py @@ -16,7 +16,7 @@ from libkernelbot import consts from libkernelbot.backend import KernelBackend from libkernelbot.background_submission_manager import BackgroundSubmissionManager -from libkernelbot.launchers import GitHubLauncher, ModalLauncher +from libkernelbot.launchers import BuildkiteLauncher, GitHubLauncher, ModalLauncher from libkernelbot.utils import setup_logging logger = setup_logging(__name__) @@ -43,6 +43,15 @@ def __init__(self, debug_mode=False): self.backend.register_launcher( GitHubLauncher(env.GITHUB_REPO, env.GITHUB_TOKEN, env.GITHUB_WORKFLOW_BRANCH) ) + # Register Buildkite launcher if configured (optional - for vendor-managed GPU runners) + if env.BUILDKITE_API_TOKEN: + self.backend.register_launcher( + BuildkiteLauncher( + org=env.BUILDKITE_ORG, + pipeline=env.BUILDKITE_PIPELINE, + token=env.BUILDKITE_API_TOKEN, + ) + ) @property def 
leaderboard_db(self): diff --git a/src/libkernelbot/consts.py b/src/libkernelbot/consts.py index f60764de..bbb37aad 100644 --- a/src/libkernelbot/consts.py +++ b/src/libkernelbot/consts.py @@ -14,6 +14,7 @@ class SchedulerType(Enum): GITHUB = "github" MODAL = "modal" SLURM = "slurm" + BUILDKITE = "buildkite" class GitHubGPU(Enum): @@ -23,6 +24,19 @@ class GitHubGPU(Enum): MI300x8 = "MI300x8" +class BuildkiteGPU(Enum): + # Queue naming: {vendor}-{gpu_type} + # Buildkite agents use tags like queue=nvidia-h100-0 for per-GPU routing + # The enum value is the queue prefix; agents append -N for specific GPU index + NVIDIA_H100 = "nvidia-h100" + NVIDIA_B200 = "nvidia-b200" + NVIDIA_A100 = "nvidia-a100" + AMD_MI300 = "amd-mi300" + AMD_MI250 = "amd-mi250" + GOOGLE_TPU = "google-tpu" + NEBIUS_H100 = "nebius-h100" + + class ModalGPU(Enum): T4 = "T4" L4 = "L4" @@ -50,7 +64,7 @@ def _make_gpu_lookup(runner_map: dict[str, Type[Enum]]): return lookup -_GPU_LOOKUP = _make_gpu_lookup({"Modal": ModalGPU, "GitHub": GitHubGPU}) +_GPU_LOOKUP = _make_gpu_lookup({"Modal": ModalGPU, "GitHub": GitHubGPU, "Buildkite": BuildkiteGPU}) def get_gpu_by_name(name: str) -> GPU: @@ -121,6 +135,14 @@ class RankCriterion(Enum): "MI300": None, "MI300x8": None, "MI250": None, + # Buildkite GPU types (vendor-prefixed queue names) + "nvidia-h100": "90a", + "nvidia-b200": "100", + "nvidia-a100": "80", + "nebius-h100": "90a", + "amd-mi300": None, + "amd-mi250": None, + "google-tpu": None, } diff --git a/src/libkernelbot/launchers/__init__.py b/src/libkernelbot/launchers/__init__.py index df47476f..1a7a8a39 100644 --- a/src/libkernelbot/launchers/__init__.py +++ b/src/libkernelbot/launchers/__init__.py @@ -1,5 +1,6 @@ +from .buildkite import BuildkiteLauncher from .github import GitHubLauncher from .launcher import Launcher from .modal import ModalLauncher -__all__ = [Launcher, GitHubLauncher, ModalLauncher] +__all__ = [Launcher, GitHubLauncher, ModalLauncher, BuildkiteLauncher] diff --git a/src/libkernelbot/launchers/buildkite.py b/src/libkernelbot/launchers/buildkite.py new file mode 100644 index 00000000..32542247 --- /dev/null +++ b/src/libkernelbot/launchers/buildkite.py @@ -0,0 +1,301 @@ +import asyncio +import base64 +import datetime +import json +import math +import zlib +from typing import Awaitable, Callable + +import requests + +from libkernelbot.consts import ( + DEFAULT_GITHUB_TIMEOUT_MINUTES, + GPU, + TIMEOUT_BUFFER_MINUTES, + BuildkiteGPU, + SubmissionMode, +) +from libkernelbot.report import RunProgressReporter +from libkernelbot.run_eval import ( + CompileResult, + EvalResult, + FullResult, + ProfileResult, + RunResult, + SystemInfo, +) +from libkernelbot.utils import setup_logging + +from .launcher import Launcher + +logger = setup_logging() + +# Buildkite API base URL +BUILDKITE_API_BASE = "https://api.buildkite.com/v2" + + +def get_timeout(config: dict) -> int: + """Get timeout in minutes from config, matching GitHub launcher pattern.""" + mode = config.get("mode") + sec_map = { + SubmissionMode.TEST.value: config.get("test_timeout"), + SubmissionMode.BENCHMARK.value: config.get("benchmark_timeout"), + SubmissionMode.LEADERBOARD.value: config.get("ranked_timeout"), + } + seconds = sec_map.get(mode) or DEFAULT_GITHUB_TIMEOUT_MINUTES * 60 + return math.ceil(seconds / 60) + + +class BuildkiteLauncher(Launcher): + """ + Launcher for Buildkite-based GPU runners. 
+ + Buildkite agents are configured per-GPU with isolated resources: + - Each agent bound to single GPU via CUDA_VISIBLE_DEVICES + - CPU/RAM limits enforced via systemd cgroups + - Queue tags route jobs to specific GPU types (e.g., queue=nvidia-h100-0) + """ + + def __init__(self, org: str, pipeline: str, token: str): + """ + Initialize Buildkite launcher. + + Args: + org: Buildkite organization slug (e.g., "gpu-mode") + pipeline: Pipeline slug (e.g., "kernelbot-runner") + token: Buildkite API token with build creation permissions + """ + super().__init__(name="Buildkite", gpus=BuildkiteGPU) + self.org = org + self.pipeline = pipeline + self.token = token + self._headers = { + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + } + + async def run_submission( + self, config: dict, gpu_type: GPU, status: RunProgressReporter + ) -> FullResult: + """ + Run a submission on a Buildkite agent. + + Args: + config: Submission configuration dict + gpu_type: GPU type to run on (determines queue routing) + status: Progress reporter for user feedback + + Returns: + FullResult with compilation and run results + """ + # Compress config (same as GitHub launcher) + payload = base64.b64encode(zlib.compress(json.dumps(config).encode("utf-8"))).decode( + "utf-8" + ) + + # Create build via Buildkite API + build_url = f"{BUILDKITE_API_BASE}/organizations/{self.org}/pipelines/{self.pipeline}/builds" + + # Queue name from GPU type value (e.g., "nvidia-h100") + # Buildkite will route to any agent with matching queue tag + queue_name = gpu_type.value + + build_data = { + "commit": "HEAD", + "branch": "main", + "message": f"Kernel submission on {gpu_type.name}", + "env": { + "SUBMISSION_PAYLOAD": payload, + "GPU_QUEUE": queue_name, + }, + } + + logger.info(f"Creating Buildkite build for {gpu_type.name} on queue {queue_name}") + + try: + response = await asyncio.to_thread( + requests.post, build_url, headers=self._headers, json=build_data + ) + response.raise_for_status() + except requests.RequestException as e: + logger.error(f"Failed to create Buildkite build: {e}") + return FullResult( + success=False, + error=f"Failed to create Buildkite build: {str(e)}", + runs={}, + system=SystemInfo(), + ) + + build = response.json() + build_number = build["number"] + build_url_html = build["web_url"] + + logger.info(f"Created Buildkite build #{build_number}: {build_url_html}") + await status.push(f"⏳ Buildkite build [#{build_number}](<{build_url_html}>) started...") + + # Poll for completion + timeout = get_timeout(config) + TIMEOUT_BUFFER_MINUTES + build_api_url = f"{BUILDKITE_API_BASE}/organizations/{self.org}/pipelines/{self.pipeline}/builds/{build_number}" + + try: + await self._wait_for_completion( + build_api_url, + build_number, + build_url_html, + timeout, + lambda state, elapsed: self._status_callback( + status, build_number, build_url_html, state, elapsed + ), + ) + except TimeoutError as e: + logger.error(f"Buildkite build #{build_number} timed out") + return FullResult( + success=False, + error=str(e), + runs={}, + system=SystemInfo(), + ) + except Exception as e: + logger.error(f"Error waiting for Buildkite build: {e}") + return FullResult( + success=False, + error=f"Build error: {str(e)}", + runs={}, + system=SystemInfo(), + ) + + await status.update(f"✅ Build [#{build_number}](<{build_url_html}>) completed") + + # Download artifacts + await status.push("Downloading artifacts...") + logger.info(f"Downloading artifacts for build #{build_number}") + + try: + result = await 
self._download_and_parse_result(build_api_url) + await status.update("Downloading artifacts... done") + return result + except Exception as e: + logger.error(f"Failed to download artifacts: {e}") + await status.update("Downloading artifacts... failed") + return FullResult( + success=False, + error=f"Failed to download artifacts: {str(e)}", + runs={}, + system=SystemInfo(), + ) + + async def _wait_for_completion( + self, + build_api_url: str, + build_number: int, + build_url_html: str, + timeout_minutes: int, + callback: Callable[[str, float], Awaitable[None]], + ): + """Poll Buildkite API until build completes or times out.""" + start_time = datetime.datetime.now(datetime.timezone.utc) + timeout = datetime.timedelta(minutes=timeout_minutes) + + while True: + try: + response = await asyncio.to_thread( + requests.get, build_api_url, headers=self._headers + ) + response.raise_for_status() + build = response.json() + + elapsed = (datetime.datetime.now(datetime.timezone.utc) - start_time).total_seconds() + + if elapsed > timeout.total_seconds(): + # Try to cancel the build + cancel_url = f"{build_api_url}/cancel" + await asyncio.to_thread( + requests.put, cancel_url, headers=self._headers + ) + raise TimeoutError( + f"Build #{build_number} cancelled - exceeded {timeout_minutes} minute timeout" + ) + + state = build.get("state", "unknown") + + if state in ("passed", "failed", "canceled", "blocked"): + if state != "passed": + logger.warning(f"Build #{build_number} finished with state: {state}") + raise RuntimeError(f"Build #{build_number} {state}") + return state + + await callback(state, elapsed) + await asyncio.sleep(10) # Poll every 10 seconds + + except TimeoutError: + raise + except Exception as e: + logger.error(f"Error polling build status: {e}") + raise + + async def _status_callback( + self, + status: RunProgressReporter, + build_number: int, + build_url_html: str, + state: str, + elapsed: float, + ): + """Update status with current build state.""" + await status.update( + f"⏳ Build [#{build_number}](<{build_url_html}>): {state} ({elapsed:.1f}s)" + ) + + async def _download_and_parse_result(self, build_api_url: str) -> FullResult: + """Download artifacts and parse result.json.""" + # Get artifacts list + artifacts_url = f"{build_api_url}/artifacts" + response = await asyncio.to_thread( + requests.get, artifacts_url, headers=self._headers + ) + response.raise_for_status() + artifacts = response.json() + + # Find result.json artifact + result_artifact = None + profile_artifact = None + for artifact in artifacts: + if artifact.get("filename") == "result.json": + result_artifact = artifact + elif artifact.get("path", "").startswith("profile_data/"): + profile_artifact = artifact + + if not result_artifact: + raise RuntimeError("Could not find result.json artifact") + + # Download result.json + download_url = result_artifact.get("download_url") + response = await asyncio.to_thread( + requests.get, download_url, headers=self._headers + ) + response.raise_for_status() + + # Parse result + data = response.json() + runs = {} + + for k, v in data.get("runs", {}).items(): + comp_res = None if v.get("compilation") is None else CompileResult(**v["compilation"]) + run_res = None if v.get("run") is None else RunResult(**v["run"]) + profile_res = None if v.get("profile") is None else ProfileResult(**v["profile"]) + + # Add profile download URL if available + if profile_res is not None and profile_artifact: + profile_res.download_url = profile_artifact.get("download_url") + + res = EvalResult( + 
start=datetime.datetime.fromisoformat(v["start"]), + end=datetime.datetime.fromisoformat(v["end"]), + compilation=comp_res, + run=run_res, + profile=profile_res, + ) + runs[k] = res + + system = SystemInfo(**data.get("system", {})) + return FullResult(success=True, error="", runs=runs, system=system) diff --git a/src/runners/buildkite-runner.py b/src/runners/buildkite-runner.py new file mode 100644 index 00000000..aec139be --- /dev/null +++ b/src/runners/buildkite-runner.py @@ -0,0 +1,82 @@ +""" +Buildkite runner script for kernel submissions. + +This script runs inside a Docker container on Buildkite agents. +It reads the submission payload from the SUBMISSION_PAYLOAD environment variable, +executes the kernel, and writes results to result.json for artifact upload. + +The agent is pre-configured with: +- CUDA_VISIBLE_DEVICES bound to a single GPU +- CPU/RAM limits via systemd cgroups +""" + +import base64 +import json +import os +import sys +import traceback +import zlib +from dataclasses import asdict +from datetime import datetime +from pathlib import Path + +from libkernelbot.run_eval import run_config + + +def serialize(obj: object): + """JSON serializer for objects not serializable by default.""" + if isinstance(obj, datetime): + return obj.isoformat() + raise TypeError(f"Type {type(obj)} not serializable") + + +def write_error_result(error_message: str): + """Write an error result to result.json when execution fails.""" + error_result = { + "success": False, + "error": error_message, + "runs": {}, + "system": {}, + } + Path("result.json").write_text(json.dumps(error_result, default=serialize)) + + +def main(): + try: + # Get payload from environment variable + payload_b64 = os.environ.get("SUBMISSION_PAYLOAD") + if not payload_b64: + write_error_result("SUBMISSION_PAYLOAD environment variable not set") + sys.exit(1) + + # Decompress and parse config + try: + payload = zlib.decompress(base64.b64decode(payload_b64)).decode("utf-8") + config = json.loads(payload) + except Exception as e: + write_error_result(f"Failed to decompress/parse payload: {e}") + sys.exit(1) + + # Run the submission + result = run_config(config) + + # Write result to file for artifact upload + result_dict = asdict(result) + Path("result.json").write_text(json.dumps(result_dict, default=serialize)) + + # Create profile_data directory if profiling was enabled + # (profile artifacts will be written there by run_config) + profile_dir = Path("profile_data") + if profile_dir.exists(): + print(f"Profile data available in {profile_dir}") + + except Exception as e: + # Catch any unexpected errors and write them to result.json + error_msg = f"Runner error: {e}\n{traceback.format_exc()}" + print(error_msg, file=sys.stderr) + write_error_result(error_msg) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/tests/test_buildkite.py b/tests/test_buildkite.py new file mode 100644 index 00000000..ac938ee4 --- /dev/null +++ b/tests/test_buildkite.py @@ -0,0 +1,196 @@ +"""Tests for BuildkiteLauncher.""" + +import base64 +import json +import zlib +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from libkernelbot.consts import BuildkiteGPU, GPU, SchedulerType, get_gpu_by_name +from libkernelbot.launchers import BuildkiteLauncher +from libkernelbot.report import RunProgressReporter + + +class MockProgressReporter(RunProgressReporter): + """Test progress reporter that captures messages.""" + + def __init__(self, title: str = "Test Buildkite Run"): + super().__init__(title) + self.messages = [] 
diff --git a/tests/test_buildkite.py b/tests/test_buildkite.py
new file mode 100644
index 00000000..ac938ee4
--- /dev/null
+++ b/tests/test_buildkite.py
@@ -0,0 +1,196 @@
+"""Tests for BuildkiteLauncher."""
+
+import base64
+import json
+import zlib
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from libkernelbot.consts import BuildkiteGPU, GPU, SchedulerType, get_gpu_by_name
+from libkernelbot.launchers import BuildkiteLauncher
+from libkernelbot.report import RunProgressReporter
+
+
+class MockProgressReporter(RunProgressReporter):
+    """Test progress reporter that captures messages."""
+
+    def __init__(self, title: str = "Test Buildkite Run"):
+        super().__init__(title)
+        self.messages = []
+        self.updates = []
+
+    async def push(self, message: str):
+        self.messages.append(message)
+
+    async def update(self, message: str):
+        self.updates.append(message)
+
+
+class TestBuildkiteGPU:
+    """Tests for BuildkiteGPU enum."""
+
+    def test_enum_values(self):
+        """Test that BuildkiteGPU has expected values."""
+        assert BuildkiteGPU.NVIDIA_H100.value == "nvidia-h100"
+        assert BuildkiteGPU.NVIDIA_B200.value == "nvidia-b200"
+        assert BuildkiteGPU.AMD_MI300.value == "amd-mi300"
+        assert BuildkiteGPU.GOOGLE_TPU.value == "google-tpu"
+
+    def test_scheduler_type_exists(self):
+        """Test that BUILDKITE scheduler type exists."""
+        assert SchedulerType.BUILDKITE.value == "buildkite"
+
+    def test_gpu_lookup(self):
+        """Test that Buildkite GPUs are in the lookup table."""
+        gpu = get_gpu_by_name("nvidia_h100")
+        assert gpu is not None
+        assert gpu.value == "nvidia-h100"
+        assert gpu.runner == "Buildkite"
+
+        gpu = get_gpu_by_name("amd_mi300")
+        assert gpu is not None
+        assert gpu.value == "amd-mi300"
+        assert gpu.runner == "Buildkite"
+
+
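# The assertions above pin down the constants this PR expects from libkernelbot.consts.
# A hypothetical sketch of those definitions follows, for orientation only; the real
# definitions live in src/libkernelbot/consts.py and may be structured differently
# (the existing GPU type in particular likely carries more fields than shown here).
from dataclasses import dataclass
from enum import Enum


class SchedulerType(Enum):
    BUILDKITE = "buildkite"
    # ... existing scheduler types omitted


class BuildkiteGPU(Enum):
    NVIDIA_H100 = "nvidia-h100"
    NVIDIA_B200 = "nvidia-b200"
    AMD_MI300 = "amd-mi300"
    GOOGLE_TPU = "google-tpu"


@dataclass
class GPU:
    name: str
    value: str
    runner: str


# test_gpu_lookup passes lower-cased enum member names such as "nvidia_h100".
_BUILDKITE_GPUS = {m.name.lower(): GPU(m.name, m.value, "Buildkite") for m in BuildkiteGPU}


def get_gpu_by_name(name: str):
    return _BUILDKITE_GPUS.get(name.lower())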
+class TestBuildkiteLauncher:
+    """Tests for BuildkiteLauncher class."""
+
+    @pytest.fixture
+    def launcher(self):
+        return BuildkiteLauncher(
+            org="test-org",
+            pipeline="test-pipeline",
+            token="test-token",
+        )
+
+    @pytest.fixture
+    def mock_config(self):
+        return {
+            "lang": "py",
+            "mode": "test",
+            "main": "main.py",
+            "sources": {"main.py": "print('hello')"},
+            "tests": [],
+            "benchmarks": [],
+            "test_timeout": 180,
+            "benchmark_timeout": 180,
+            "ranked_timeout": 180,
+        }
+
+    @pytest.fixture
+    def gpu_type(self):
+        return GPU(name="NVIDIA_H100", value="nvidia-h100", runner="Buildkite")
+
+    def test_init(self, launcher):
+        """Test launcher initialization."""
+        assert launcher.name == "Buildkite"
+        assert launcher.org == "test-org"
+        assert launcher.pipeline == "test-pipeline"
+        assert launcher.gpus == BuildkiteGPU
+
+    def test_headers(self, launcher):
+        """Test API headers are set correctly."""
+        assert "Authorization" in launcher._headers
+        assert launcher._headers["Authorization"] == "Bearer test-token"
+        assert launcher._headers["Content-Type"] == "application/json"
+
+    def test_payload_compression(self, mock_config):
+        """Test that payload compression/decompression works."""
+        # Compress (same logic as launcher)
+        payload = base64.b64encode(
+            zlib.compress(json.dumps(mock_config).encode("utf-8"))
+        ).decode("utf-8")
+
+        # Decompress (same logic as runner)
+        decompressed = zlib.decompress(base64.b64decode(payload)).decode("utf-8")
+        restored = json.loads(decompressed)
+
+        assert restored == mock_config
+
+    @pytest.mark.asyncio
+    async def test_run_submission_creates_build(self, launcher, mock_config, gpu_type):
+        """Test that run_submission creates a Buildkite build."""
+        mock_response = MagicMock()
+        mock_response.json.return_value = {
+            "number": 123,
+            "web_url": "https://buildkite.com/test/builds/123",
+            "state": "scheduled",
+        }
+        mock_response.raise_for_status = MagicMock()
+
+        with patch("libkernelbot.launchers.buildkite.requests.post", return_value=mock_response) as mock_post:
+            with patch.object(launcher, "_wait_for_completion", new_callable=AsyncMock):
+                with patch.object(launcher, "_download_and_parse_result", new_callable=AsyncMock) as mock_download:
+                    mock_download.return_value = MagicMock(success=True)
+
+                    reporter = MockProgressReporter()
+                    result = await launcher.run_submission(mock_config, gpu_type, reporter)
+
+        # Verify API was called
+        mock_post.assert_called_once()
+        call_args = mock_post.call_args
+
+        # Check URL contains org and pipeline
+        url = call_args[0][0]
+        assert "test-org" in url
+        assert "test-pipeline" in url
+
+        # Check payload was compressed and queue set
+        body = call_args[1]["json"]
+        assert "SUBMISSION_PAYLOAD" in body["env"]
+        assert body["env"]["GPU_QUEUE"] == "nvidia-h100"
+
+        # Verify result
+        assert result.success is True
+
+    @pytest.mark.asyncio
+    async def test_run_submission_handles_api_error(self, launcher, mock_config, gpu_type):
+        """Test that API errors are handled gracefully."""
+        import requests
+
+        with patch("libkernelbot.launchers.buildkite.requests.post") as mock_post:
+            mock_post.side_effect = requests.RequestException("API Error")
+
+            reporter = MockProgressReporter()
+            result = await launcher.run_submission(mock_config, gpu_type, reporter)
+
+        assert result.success is False
+        assert "API Error" in result.error
+
+    @pytest.mark.asyncio
+    async def test_status_updates(self, launcher, mock_config, gpu_type):
+        """Test that status updates are sent correctly."""
+        mock_response = MagicMock()
+        mock_response.json.return_value = {
+            "number": 456,
+            "web_url": "https://buildkite.com/test/builds/456",
+            "state": "scheduled",
+        }
+        mock_response.raise_for_status = MagicMock()
+
+        with patch("libkernelbot.launchers.buildkite.requests.post", return_value=mock_response):
+            with patch.object(launcher, "_wait_for_completion", new_callable=AsyncMock):
+                with patch.object(launcher, "_download_and_parse_result", new_callable=AsyncMock) as mock_download:
+                    mock_download.return_value = MagicMock(success=True)
+
+                    reporter = MockProgressReporter()
+                    await launcher.run_submission(mock_config, gpu_type, reporter)
+
+        # Check status messages were sent
+        assert any("456" in msg for msg in reporter.messages)
+        assert any("completed" in msg.lower() for msg in reporter.updates)
+
+
+class TestBuildkiteRunner:
+    """Tests for buildkite-runner.py script."""
+
+    def test_runner_script_syntax(self):
+        """Test that runner script has valid Python syntax."""
+        import py_compile
+        from pathlib import Path
+
+        runner_path = Path(__file__).parent.parent / "src" / "runners" / "buildkite-runner.py"
+        # This will raise SyntaxError if invalid
+        py_compile.compile(str(runner_path), doraise=True)
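For orientation, the calls mocked out in TestBuildkiteLauncher correspond roughly to the interaction below with Buildkite's REST API. This is a sketch that assumes the standard builds endpoint and only the response fields asserted above (number, web_url, state); the actual request construction and polling in libkernelbot.launchers.buildkite may differ in details such as the branch, commit, and message fields or the polling interval.

import time

import requests

org, pipeline, token = "test-org", "test-pipeline", "test-token"
url = f"https://api.buildkite.com/v2/organizations/{org}/pipelines/{pipeline}/builds"
headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}

# Create a build; SUBMISSION_PAYLOAD and GPU_QUEUE are consumed by .buildkite/pipeline.yml.
body = {
    "commit": "HEAD",
    "branch": "main",
    "message": "Kernel submission",
    "env": {
        "SUBMISSION_PAYLOAD": "<zlib-compressed, base64-encoded config>",
        "GPU_QUEUE": "nvidia-h100",
    },
}
resp = requests.post(url, headers=headers, json=body, timeout=30)
resp.raise_for_status()
build = resp.json()  # contains "number", "web_url", "state", ...

# Poll until the build leaves a non-terminal state (the step _wait_for_completion is mocked for).
while build["state"] in ("scheduled", "running"):
    time.sleep(10)
    build = requests.get(f"{url}/{build['number']}", headers=headers, timeout=30).json()
print(build["state"])  # e.g. "passed" or "failed"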