Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/copy-pr-bot.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# NOTE(review): presumably the switch that turns copy-pr-bot on for this repository — confirm against the copy-pr-bot docs.
enabled: true
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: this is a config for the copy-pr-bot, which is an extra security measure for running tests when the code may be coming from an external fork (see https://docs.gha-runners.nvidia.com/platform/apps/copy-pr-bot/).

The same mechanism also applies to regular (non-fork) branches, but there the process is automatic as long as the commit is signed.

# NOTE(review): presumably controls whether draft PRs are synced automatically — confirm against the copy-pr-bot docs.
auto_sync_draft: false
# NOTE(review): presumably controls whether ready-for-review PRs are synced automatically — confirm against the copy-pr-bot docs.
auto_sync_ready: true
82 changes: 82 additions & 0 deletions .github/workflows/e2e-gpu-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Reusable workflow: runs the GPU end-to-end suite against an already-published
# cluster image, once per GPU runner listed in the matrix below.
name: GPU E2E Test

on:
  workflow_call:
    inputs:
      image-tag:
        description: "Image tag to test (typically the commit SHA)"
        required: true
        type: string

permissions:
  contents: read
  packages: read

jobs:
  e2e-gpu:
    name: "E2E GPU (${{ matrix.name }})"
    runs-on: ${{ matrix.runner }}
    # Legs marked experimental are allowed to fail without failing the run.
    continue-on-error: ${{ matrix.experimental }}
    timeout-minutes: 30
    strategy:
      # Let every leg finish even if another one fails.
      fail-fast: false
      matrix:
        # Each leg gets its own runner label, cluster name and gateway port.
        include:
          - name: linux-arm64
            runner: linux-arm64-gpu-l4-latest-1
            cluster: e2e-gpu-arm64
            port: "8083"
            experimental: false
          - name: linux-amd64
            runner: linux-amd64-gpu-rtxpro6000-latest-1
            cluster: e2e-gpu-amd64
            port: "8084"
            experimental: false
          - name: wsl-amd64
            runner: wsl-amd64-gpu-rtxpro6000-latest-1
            cluster: e2e-gpu-wsl
            port: "8085"
            experimental: true
    container:
      image: ghcr.io/nvidia/openshell/ci:latest
      credentials:
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
      # The job drives the host Docker daemon through the mounted socket.
      options: --privileged
      volumes:
        - /var/run/docker.sock:/var/run/docker.sock
    env:
      MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      IMAGE_TAG: ${{ inputs.image-tag }}
      OPENSHELL_REGISTRY: ghcr.io/nvidia/openshell
      OPENSHELL_REGISTRY_HOST: ghcr.io
      OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell
      OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }}
      OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
      OPENSHELL_GATEWAY: ${{ matrix.cluster }}
    steps:
      - uses: actions/checkout@v4

      - name: Log in to GHCR
        # Values reach the shell as environment variables rather than being
        # expanded into the script text, so they can never be parsed as shell.
        env:
          GHCR_USERNAME: ${{ github.actor }}
          GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: echo "$GHCR_TOKEN" | docker login ghcr.io -u "$GHCR_USERNAME" --password-stdin

      - name: Pull cluster image
        # image-tag is caller-supplied; keep it out of the script text and
        # expand it quoted so it is always treated as a single argument.
        env:
          CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }}
        run: docker pull "$CLUSTER_IMAGE"

      - name: Install Python dependencies and generate protobuf stubs
        run: uv sync --frozen && mise run --no-prepare python:proto

      - name: Bootstrap GPU cluster
        env:
          GATEWAY_HOST: host.docker.internal
          GATEWAY_PORT: ${{ matrix.port }}
          CLUSTER_NAME: ${{ matrix.cluster }}
          # Passes --gpu to the gateway bootstrap so the cluster comes up with GPU passthrough enabled.
          CLUSTER_GPU: "1"
          SKIP_IMAGE_PUSH: "1"
          SKIP_CLUSTER_IMAGE_BUILD: "1"
          OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }}
        run: mise run --no-prepare --skip-deps cluster

      - name: Run tests
        run: mise run --no-prepare --skip-deps e2e:python:gpu
77 changes: 77 additions & 0 deletions .github/workflows/test-gpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Entry point for GPU coverage: decides whether the run is allowed, invokes the
# docker-build reusable workflow for the gateway and cluster components, then
# hands the commit SHA to the GPU E2E reusable workflow as the image tag.
name: GPU Test

on:
  push:
    branches:
      - "pull-request/[0-9]+"
  workflow_dispatch: {}
  # Add `schedule:` here when we want nightly coverage from the same workflow.

permissions:
  contents: read
  pull-requests: read
  packages: write

jobs:
  pr_metadata:
    name: Resolve PR metadata
    runs-on: ubuntu-latest
    outputs:
      should_run: ${{ steps.gate.outputs.should_run }}
    steps:
      # PR lookup only makes sense for pushes; a failed lookup is tolerated
      # here and turned into should_run=false by the gate step.
      - id: get_pr_info
        if: github.event_name == 'push'
        continue-on-error: true
        uses: nv-gha-runners/get-pr-info@main

      - id: gate
        shell: bash
        env:
          EVENT_NAME: ${{ github.event_name }}
          GITHUB_SHA_VALUE: ${{ github.sha }}
          GET_PR_INFO_OUTCOME: ${{ steps.get_pr_info.outcome }}
          PR_INFO: ${{ steps.get_pr_info.outputs.pr-info }}
        run: |
          # Default to not running; only the branches below can flip it.
          should_run=false

          if [ "$EVENT_NAME" != "push" ]; then
            # Non-push triggers (manual dispatch) always run.
            should_run=true
          elif [ "$GET_PR_INFO_OUTCOME" = "success" ]; then
            pr_head_sha="$(jq -r '.head.sha' <<< "$PR_INFO")"
            gpu_labeled="$(jq -r '[.labels[].name] | index("test:e2e-gpu") != null' <<< "$PR_INFO")"
            # Only trust copied pull-request/* pushes that still match the PR
            # head SHA and are explicitly labeled for GPU coverage.
            if [ "$pr_head_sha" = "$GITHUB_SHA_VALUE" ] && [ "$gpu_labeled" = "true" ]; then
              should_run=true
            fi
          fi

          echo "should_run=$should_run" >> "$GITHUB_OUTPUT"

  build-gateway:
    needs: pr_metadata
    if: needs.pr_metadata.outputs.should_run == 'true'
    uses: ./.github/workflows/docker-build.yml
    with:
      component: gateway

  build-cluster:
    needs: pr_metadata
    if: needs.pr_metadata.outputs.should_run == 'true'
    uses: ./.github/workflows/docker-build.yml
    with:
      component: cluster

  e2e-gpu:
    needs:
      - pr_metadata
      - build-gateway
      - build-cluster
    if: needs.pr_metadata.outputs.should_run == 'true'
    uses: ./.github/workflows/e2e-gpu-test.yaml
    with:
      image-tag: ${{ github.sha }}
Loading