Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Buildkite Pipeline for Kernel Submissions
#
# This pipeline runs kernel submissions on GPU-bound Buildkite agents.
# Each agent is configured with:
# - CUDA_VISIBLE_DEVICES bound to a single GPU
# - CPU/RAM limits via systemd cgroups
# - Queue tag for GPU routing (e.g., queue=nvidia-h100-0)
#
# Environment variables passed from BuildkiteLauncher:
# - SUBMISSION_PAYLOAD: Base64-encoded, zlib-compressed submission config
# - GPU_QUEUE: Queue name for agent routing
#
# Note: timeout_in_minutes is set high (60) as a safety net.
# The BuildkiteLauncher handles dynamic timeouts based on submission mode
# and will cancel jobs that exceed their configured timeout.

# Single-step pipeline: run one kernel submission inside the runner image on
# the agent selected by GPU_QUEUE, then upload the result artifacts.
steps:
  - label: ":gpu: Run Kernel Submission"
    command: "python /opt/kernelbot/buildkite-runner.py"
    env:
      # Payload is passed via BuildkiteLauncher
      SUBMISSION_PAYLOAD: "${SUBMISSION_PAYLOAD}"
    agents:
      # Route to agent with matching queue tag
      queue: "${GPU_QUEUE}"
    # Safety timeout - BuildkiteLauncher handles actual timeout enforcement
    timeout_in_minutes: 60
    artifact_paths:
      - "result.json"
      - "profile_data/**/*"
    plugins:
      - docker#v5.11.0:
          image: "ghcr.io/gpu-mode/kernelbot-runner:latest"
          always-pull: true
          # Forwards pipeline/step variables (e.g. SUBMISSION_PAYLOAD) into
          # the container.
          propagate-environment: true
          # The container is started with access to every GPU (`gpus: all`),
          # so the agent's single-GPU binding only takes effect if
          # CUDA_VISIBLE_DEVICES reaches the container. It is an agent-level
          # variable rather than a pipeline variable, so pass it through
          # explicitly by name.
          # NOTE(review): confirm the agent exports CUDA_VISIBLE_DEVICES in
          # the job's environment.
          environment:
            - "CUDA_VISIBLE_DEVICES"
          gpus: "all"
          # Resource limits (can be overridden via env vars)
          memory: "${MEMORY_LIMIT:-32g}"
          cpus: "${CPU_LIMIT:-16}"
          # Mount working directory for artifacts
          volumes:
            - ".:/workdir"
          workdir: "/workdir"
    retry:
      automatic:
        - exit_status: -1  # Agent lost connection
          limit: 1
        - exit_status: 255  # SSH error
          limit: 1
107 changes: 107 additions & 0 deletions .github/workflows/build-runner-image.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
name: "Build Runner Image"

# Triggers: a weekly rebuild, manual dispatch (with an optional push toggle),
# and pushes to main / pull requests that touch runner-related paths.
on:
  schedule:
    # Rebuild weekly on Sundays at 2 AM UTC
    - cron: "0 2 * * 0"
  workflow_dispatch:
    inputs:
      push:
        type: boolean
        required: false
        default: "true"
        description: "Push image to registry"
  pull_request:
    paths:
      - "docker/kernelbot-runner/**"
      - "src/libkernelbot/**"
      - "src/runners/buildkite-runner.py"
  push:
    branches:
      - "main"
    paths:
      - "docker/kernelbot-runner/**"
      - "src/libkernelbot/**"
      - "src/runners/buildkite-runner.py"
      - ".github/workflows/build-runner-image.yml"

# Workflow-wide values: the registry host and the image repository path.
env:
  IMAGE_NAME: "gpu-mode/kernelbot-runner"
  REGISTRY: "ghcr.io"

jobs:
  # Builds the runner image with Buildx. The image is pushed only for
  # non-pull-request events (and not when a manual run sets push=false).
  build:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # Pull requests never push, so they skip the registry login.
      - name: Log in to Container Registry
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          tags: |
            type=raw,value=latest,enable={{is_default_branch}}
            type=sha,prefix=sha-
            type=ref,event=branch
            type=ref,event=pr

      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: docker/kernelbot-runner/Dockerfile
          push: ${{ github.event_name != 'pull_request' && (github.event.inputs.push != 'false') }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

      # The tag list is handed to the script through an environment variable
      # instead of being expanded into the script text, so its contents are
      # treated as data and never parsed by the shell. REGISTRY and IMAGE_NAME
      # come from the workflow-level env.
      - name: Generate build summary
        env:
          IMAGE_TAGS: ${{ steps.meta.outputs.tags }}
        run: |
          {
            echo "## Docker Image Build Summary"
            echo ""
            echo "**Image:** \`${REGISTRY}/${IMAGE_NAME}\`"
            echo ""
            echo "**Tags:**"
            echo '```'
            echo "${IMAGE_TAGS}"
            echo '```'
          } >> "$GITHUB_STEP_SUMMARY"

      # Best-effort notification: a Slack failure must not fail the build.
      - name: Notify vendors (Slack)
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        continue-on-error: true
        uses: slackapi/slack-github-action@v1.25.0
        with:
          payload: |
            {
              "text": "New kernelbot-runner image published",
              "blocks": [
                {
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
                    "text": "*New kernelbot-runner image published* :package:\n\nVendors: run `./scripts/buildkite/update-image.sh` to update your agents.\n\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View build>"
                  }
                }
              ]
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_VENDOR_WEBHOOK }}
          SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,5 @@ reference-kernels/
yoyo.ini
.venv
.claude/
*.egg
*.egg-info/
67 changes: 67 additions & 0 deletions docker/kernelbot-runner/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Kernelbot Runner Docker Image
#
# This image is used by Buildkite agents to run kernel submissions.
# It matches the Modal runner configuration for consistent behavior.
#
# Build:
# docker build -t ghcr.io/gpu-mode/kernelbot-runner:latest -f docker/kernelbot-runner/Dockerfile .
#
# Run locally (for testing):
# docker run --gpus '"device=0"' -e SUBMISSION_PAYLOAD="..." ghcr.io/gpu-mode/kernelbot-runner:latest

FROM nvidia/cuda:13.1.0-devel-ubuntu24.04

LABEL org.opencontainers.image.source="https://github.com/gpu-mode/kernelbot"
LABEL org.opencontainers.image.description="Kernelbot GPU runner for kernel competitions"

# Install system dependencies: Python 3.13, compilers, git and curl, then
# point `python3` and `python` at the 3.13 interpreter.
# DEBIAN_FRONTEND is set for the install command only, so no package can stall
# the build on an interactive configuration prompt and the setting does not
# leak into the runtime environment.
# NOTE(review): confirm python3.13 is installable from this base image's apt
# sources.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        python3.13 \
        python3.13-venv \
        python3-pip \
        git \
        gcc-13 \
        g++-13 \
        clang-18 \
        curl \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf /usr/bin/python3.13 /usr/bin/python3 \
    && ln -sf /usr/bin/python3.13 /usr/bin/python

# Create virtual environment and put it first on PATH so every later
# `pip` / `python` call uses it.
RUN python3 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Install Python dependencies (matching modal_runner.py).
# --no-cache-dir keeps pip's download cache out of the image layers; it is
# used on every pip call below for the same reason.
COPY docker/kernelbot-runner/requirements-runner.txt /tmp/
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /tmp/requirements-runner.txt

# Install PyTorch with CUDA 13.0 support
RUN pip install --no-cache-dir \
    torch==2.9.1 \
    torchvision \
    torchaudio \
    --index-url https://download.pytorch.org/whl/cu130

# Install additional frameworks
RUN pip install --no-cache-dir \
    tinygrad~=0.10

# Install NVIDIA CUDA packages
RUN pip install --no-cache-dir \
    nvidia-cupynumeric~=25.3 \
    nvidia-cutlass-dsl==4.3.5 \
    "cuda-core[cu13]" \
    "cuda-python[all]==13.0"

# Copy kernelbot library and runner
WORKDIR /opt/kernelbot
COPY src/libkernelbot /opt/kernelbot/libkernelbot
COPY src/runners/buildkite-runner.py /opt/kernelbot/

# Set PYTHONPATH so libkernelbot is importable.
# Nothing earlier in this Dockerfile defines PYTHONPATH, so appending
# ":$PYTHONPATH" would leave a trailing empty entry, which Python treats as
# the current working directory and searches ahead of the standard library.
# Set the single path instead.
ENV PYTHONPATH="/opt/kernelbot"

# Default command
CMD ["python", "/opt/kernelbot/buildkite-runner.py"]
17 changes: 17 additions & 0 deletions docker/kernelbot-runner/requirements-runner.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Kernelbot Runner Dependencies
# These should match the Modal runner configuration in modal_runner.py
#
# Installed into the image's virtual environment by
# docker/kernelbot-runner/Dockerfile (`pip install -r`), in the layer that
# runs before the PyTorch install.
# Entries without a version specifier are not pinned by this file.

# Build tools
ninja~=1.11
wheel~=0.45
setuptools

# Core dependencies
requests~=2.32.4
packaging~=25.0
numpy~=2.3
pytest
PyYAML

# Triton for GPU kernels
# NOTE(review): the later torch install may bring its own triton requirement;
# confirm which version ends up in the image.
triton
Loading
Loading