Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions tests/integration-tests/tests/basic/test_essential_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ def test_essential_features(
cluster, region, instance, scheduler, default_threads_per_core, request, scheduler_commands_factory
)

_test_gpu_workload(cluster, scheduler_commands_factory, test_datadir)


def _test_mpi_job(
scheduler, region, instance, cluster, test_datadir, scheduler_commands_factory, scaledown_idletime, max_queue_size
Expand Down Expand Up @@ -331,6 +333,29 @@ def _test_custom_bootstrap_scripts_args_quotes(cluster):
)


def _test_gpu_workload(cluster, scheduler_commands_factory, test_datadir):
    """Run the CUDA sample jobs on the ``gpu`` partition and check they succeed.

    One job per CUDA sample is submitted with the ``gpu_job.sh`` script taken
    from ``test_datadir``; the sample path is passed as the only script
    argument. Every job is queued before any of them is awaited, then each job
    is waited for and asserted to have succeeded.
    """
    commands = scheduler_commands_factory(RemoteCommandExecutor(cluster))

    def _submit(sample):
        # Queue a single-node, single-slot job for ``sample`` and return its id.
        logging.info("Submitting CUDA sample job for %s", sample)
        submission = commands.submit_script(
            str(test_datadir / "gpu_job.sh"),
            script_args=[sample],
            partition="gpu",
            nodes=1,
            slots=1,
        )
        return commands.assert_job_submitted(submission.stdout)

    # Submit everything up front; waiting only starts once all jobs are queued.
    submitted_ids = [
        _submit(sample)
        for sample in ("1_Utilities/deviceQuery", "4_CUDA_Libraries/matrixMulCUBLAS")
    ]

    for submitted_id in submitted_ids:
        commands.wait_job_completed(submitted_id, timeout=20)
        commands.assert_job_succeeded(submitted_id)


def _test_disable_hyperthreading(
cluster, region, instance, scheduler, default_threads_per_core, request, scheduler_commands_factory
):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash
#SBATCH --job-name=cuda-gpu-validate
#SBATCH --output=cuda-gpu-validate-%j.out

# Slurm batch job: compile one CUDA sample and execute the resulting binary.
#
# The sample is named by the single script argument, given as
# <category>/<sample> relative to the Samples/ directory of the pre-installed
# /usr/local/cuda-samples-13.0 tree. CUDA samples 13.x are CMake-only and
# /usr/local/... isn't writable, so the sample is copied into a temporary
# directory and built there.

set -euo pipefail

# Print the usage text on stderr and abort with status 2.
usage() {
    {
        echo "Usage: sbatch $0 <category>/<sample>"
        echo " e.g. sbatch $0 1_Utilities/deviceQuery"
    } >&2
    exit 2
}

# Exactly one argument is accepted.
[[ $# -eq 1 ]] || usage

sample_rel=$1
# Last path component; this is also the name of the binary that gets run.
sample_name=${sample_rel##*/}
samples_root=/usr/local/cuda-samples-13.0
sample_src="$samples_root/Samples/$sample_rel"

# Put the CUDA toolchain first on PATH.
PATH=/usr/local/cuda/bin:${PATH}
export PATH

# Record the execution environment in the job output. Any failing tool below
# aborts the job because of `set -e`.
echo "Node: $(hostname)"
echo "Sample: $sample_rel"
echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-unset}"
echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-unset}"
nvidia-smi -L
nvidia-smi
nvcc --version

# Refuse to continue when the requested sample is not part of the tree.
[[ -d "$sample_src" ]] || {
    echo "ERROR: sample not found: $sample_src" >&2
    exit 2
}

# Scratch area for the build; removed again whenever the script exits.
scratch=$(mktemp -d)
cleanup() {
    rm -rf "$scratch"
}
trap cleanup EXIT

# Pieces of the samples tree shared by every sample: Common/, the top-level
# cmake/ directory and the top-level CMakeLists.txt.
cp -r "$samples_root/Common" "$samples_root/cmake" "$samples_root/CMakeLists.txt" "$scratch"/

# Recreate the sample at the same relative location inside the scratch tree.
sample_dst="$scratch/Samples/$sample_rel"
mkdir -p "$(dirname "$sample_dst")"
cp -r "$sample_src" "$sample_dst"

build_dir="$sample_dst/build"
# Parallelism follows the Slurm CPU allocation when set, otherwise 2.
build_jobs=${SLURM_CPUS_PER_TASK:-2}

echo "===== Building $sample_rel ====="
cmake -S "$sample_dst" -B "$build_dir"
cmake --build "$build_dir" -j"$build_jobs"

echo "===== Running $sample_name ====="
# Not exec'd, so the EXIT trap still cleans up after the sample finishes.
"$build_dir/$sample_name"
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,16 @@ Scheduling:
- InstanceType: {{ instance }}
MinCount: 1
MaxCount: {{ max_queue_size }}
# Queue targeted by the GPU workload test, which submits its CUDA sample jobs
# with partition="gpu"; the queue name here must stay in sync with that value.
- Name: gpu
Networking:
SubnetIds:
- {{ private_subnet_id }}
ComputeResources:
- Name: cr1
Instances:
# Instance type is fixed rather than templated from {{ instance }}.
- InstanceType: g4dn.2xlarge
# NOTE(review): MinCount 0 / MaxCount 1 presumably means the single GPU node is
# only launched when a job is queued - confirm against the scheduler settings.
MinCount: 0
MaxCount: 1
SlurmSettings:
ScaledownIdletime: {{ scaledown_idletime }} # Use shorter scaledown time to test logs in slurm_suspend
Monitoring:
Expand Down
Loading