gpu-mode · msaroufim · Jan 29, 2026 · Jan 29, 2026 · Jan 29, 2026 · Jan 29, 2026
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -0,0 +1,50 @@
+# Buildkite Pipeline for Kernel Submissions
+#
+# This pipeline runs kernel submissions on GPU-bound Buildkite agents.
+# Each agent is configured with:
+#   - CUDA_VISIBLE_DEVICES bound to a single GPU
+#   - CPU/RAM limits via systemd cgroups
+#   - Queue tag for GPU routing (e.g., queue=nvidia-h100-0)
+#
+# Environment variables passed from BuildkiteLauncher:
+#   - SUBMISSION_PAYLOAD: Base64-encoded, zlib-compressed submission config
+#   - GPU_QUEUE: Queue name for agent routing
+#
+# Note: timeout_in_minutes is set high (60) as a safety net.
+# The BuildkiteLauncher handles dynamic timeouts based on submission mode
+# and will cancel jobs that exceed their configured timeout.
+
+steps:
+  - label: ":gpu: Run Kernel Submission"
+    command: "python /opt/kernelbot/buildkite-runner.py"
+    env:
+      # Compressed submission config, supplied by BuildkiteLauncher
+      SUBMISSION_PAYLOAD: "${SUBMISSION_PAYLOAD}"
+    agents:
+      # Route to the agent whose queue tag matches GPU_QUEUE
+      queue: "${GPU_QUEUE}"
+    # Upper bound only - BuildkiteLauncher enforces the real per-mode timeout
+    timeout_in_minutes: 60
+    artifact_paths:
+      - "result.json"
+      - "profile_data/**/*"
+    plugins:
+      - docker#v5.11.0:
+          image: "ghcr.io/gpu-mode/kernelbot-runner:latest"
+          always-pull: true
+          propagate-environment: true
+          # NOTE(review): this exposes every host GPU to the container; confirm
+          # the agent's CUDA_VISIBLE_DEVICES binding is what the runner sees.
+          gpus: all
+          # Resource limits (can be overridden via env vars)
+          memory: "${MEMORY_LIMIT:-32g}"
+          cpus: "${CPU_LIMIT:-16}"
+          # Checkout is mounted at `workdir` by the plugin; no extra volume needed
+          mount-checkout: true
+          workdir: "/workdir"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent lost connection
+          limit: 1
+        - exit_status: 255  # SSH error
+          limit: 1
diff --git a/.github/workflows/build-runner-image.yml b/.github/workflows/build-runner-image.yml
@@ -0,0 +1,107 @@
+name: Build Runner Image
+
+on:
+  schedule:
+    - cron: "0 2 * * 0"  # weekly rebuild: Sundays at 02:00 UTC
+  # Manual runs can build without pushing by setting `push` to false.
+  workflow_dispatch:
+    inputs:
+      push:
+        type: boolean
+        required: false
+        default: "true"
+        description: "Push image to registry"
+  pull_request:
+    paths:
+      - "docker/kernelbot-runner/**"
+      - "src/libkernelbot/**"
+      - "src/runners/buildkite-runner.py"
+  push:
+    branches: [main]
+    paths:
+      - "docker/kernelbot-runner/**"
+      - "src/libkernelbot/**"
+      - "src/runners/buildkite-runner.py"
+      - ".github/workflows/build-runner-image.yml"
+
+# Registry coordinates referenced by the job steps.
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: gpu-mode/kernelbot-runner
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - uses: actions/checkout@v4
+        name: Checkout repository
+
+      - uses: docker/setup-buildx-action@v3
+        name: Set up Docker Buildx
+
+      - name: Log in to Container Registry
+        uses: docker/login-action@v3
+        if: ${{ github.event_name != 'pull_request' }}  # skipped for pull requests
+        with:
+          password: ${{ secrets.GITHUB_TOKEN }}
+          username: ${{ github.actor }}
+          registry: ${{ env.REGISTRY }}
+
+      - id: meta
+        name: Extract metadata for Docker
+        uses: docker/metadata-action@v5  # yields the tags/labels used by later steps
+        with:
+          tags: |
+            type=raw,value=latest,enable={{is_default_branch}}
+            type=sha,prefix=sha-
+            type=ref,event=branch
+            type=ref,event=pr
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:  # push is off for pull requests and when the manual `push` input is false
+          file: docker/kernelbot-runner/Dockerfile
+          context: .
+          push: ${{ github.event_name != 'pull_request' && github.event.inputs.push != 'false' }}
+          labels: ${{ steps.meta.outputs.labels }}
+          tags: ${{ steps.meta.outputs.tags }}
+          cache-to: type=gha,mode=max
+          cache-from: type=gha
+
+      - name: Generate build summary
+        env:  # values reach the script through env, not inline expressions
+          IMAGE_REF: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          IMAGE_TAGS: ${{ steps.meta.outputs.tags }}
+        run: |
+          {
+            printf '## Docker Image Build Summary\n\n'
+            printf '**Image:** `%s`\n\n**Tags:**\n' "$IMAGE_REF"
+            printf '```\n%s\n```\n' "$IMAGE_TAGS"
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Notify vendors (Slack)
+        uses: slackapi/slack-github-action@v1.25.0
+        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
+        continue-on-error: true  # a failed notification does not fail the job
+        env:
+          SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
+          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_VENDOR_WEBHOOK }}
+        with:
+          payload: |
+            {
+              "text": "New kernelbot-runner image published",
+              "blocks": [
+                {
+                  "type": "section",
+                  "text": {
+                    "type": "mrkdwn",
+                    "text": "*New kernelbot-runner image published* :package:\n\nVendors: run `./scripts/buildkite/update-image.sh` to update your agents.\n\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View build>"
+                  }
+                }
+              ]
+            }
diff --git a/.gitignore b/.gitignore
@@ -11,3 +11,5 @@ reference-kernels/
 yoyo.ini
 .venv
 .claude/
+*.egg
+*.egg-info/
diff --git a/docker/kernelbot-runner/Dockerfile b/docker/kernelbot-runner/Dockerfile
@@ -0,0 +1,67 @@
+# Kernelbot Runner Docker Image
+#
+# This image is used by Buildkite agents to run kernel submissions.
+# It matches the Modal runner configuration for consistent behavior.
+#
+# Build:
+#   docker build -t ghcr.io/gpu-mode/kernelbot-runner:latest -f docker/kernelbot-runner/Dockerfile .
+#
+# Run locally (for testing):
+#   docker run --gpus '"device=0"' -e SUBMISSION_PAYLOAD="..." ghcr.io/gpu-mode/kernelbot-runner:latest
+
+FROM nvidia/cuda:13.1.0-devel-ubuntu24.04
+# OCI image metadata
+LABEL org.opencontainers.image.source="https://github.com/gpu-mode/kernelbot" \
+      org.opencontainers.image.description="Kernelbot GPU runner for kernel competitions"
+
+# Install system dependencies. DEBIAN_FRONTEND=noninteractive keeps debconf from prompting.
+# NOTE(review): confirm python3.13 is installable from the base image's apt sources.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    python3.13 \
+    python3.13-venv \
+    python3-pip \
+    git \
+    gcc-13 \
+    g++-13 \
+    clang-18 \
+    curl \
+    && rm -rf /var/lib/apt/lists/* \
+    && ln -sf /usr/bin/python3.13 /usr/bin/python3 && ln -sf /usr/bin/python3.13 /usr/bin/python
+
+# Create virtual environment; PIP_NO_CACHE_DIR keeps pip's download cache out of the image layers
+RUN python3 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH" PIP_NO_CACHE_DIR=1
+
+# Upgrade pip, then install the runner requirements (mirrors modal_runner.py)
+COPY docker/kernelbot-runner/requirements-runner.txt /tmp/requirements-runner.txt
+RUN python -m pip install --upgrade pip \
+    && python -m pip install -r /tmp/requirements-runner.txt
+
+# PyTorch, torchvision and torchaudio from the CUDA 13.0 (cu130) wheel index
+RUN pip install \
+    --index-url https://download.pytorch.org/whl/cu130 \
+    torch==2.9.1 \
+    torchvision \
+    torchaudio
+
+# Install additional frameworks
+RUN pip install "tinygrad~=0.10"
+
+# Install NVIDIA CUDA Python packages:
+# cuPyNumeric, the CUTLASS DSL, and the cuda-core / cuda-python bindings
+RUN pip install \
+    "nvidia-cupynumeric~=25.3" \
+    "nvidia-cutlass-dsl==4.3.5" \
+    "cuda-core[cu13]" \
+    "cuda-python[all]==13.0"
+
+# Copy kernelbot library and runner
+WORKDIR /opt/kernelbot
+COPY src/libkernelbot /opt/kernelbot/libkernelbot
+COPY src/runners/buildkite-runner.py /opt/kernelbot/
+
+# Make libkernelbot importable; assigned outright since this file sets no earlier PYTHONPATH to extend
+ENV PYTHONPATH="/opt/kernelbot"
+
+# Default command: run the Buildkite runner script
+CMD ["python", "/opt/kernelbot/buildkite-runner.py"]
diff --git a/docker/kernelbot-runner/requirements-runner.txt b/docker/kernelbot-runner/requirements-runner.txt
@@ -0,0 +1,17 @@
+# Kernelbot Runner Dependencies
+# These should match the Modal runner configuration in modal_runner.py
+
+# Build tools
+ninja~=1.11
+wheel~=0.45
+setuptools
+
+# Core dependencies
+requests~=2.32.4
+packaging~=25.0
+numpy~=2.3
+pytest
+PyYAML
+
+# Triton for GPU kernels
+triton