def _test_gpu_workload(cluster, scheduler_commands_factory, test_datadir):
    """Run the CUDA sample jobs in the "gpu" partition and check that each one succeeds.

    One job per CUDA sample is submitted from the gpu_job.sh script located in
    test_datadir, with the sample's relative path as the only script argument.
    Every job is submitted before any of them is awaited; afterwards each job
    is waited on and asserted to have succeeded, in submission order.
    """
    scheduler_commands = scheduler_commands_factory(RemoteCommandExecutor(cluster))

    def _submit(cuda_sample):
        # Submit a single sample and hand back the job id parsed from the submission output.
        logging.info("Submitting CUDA sample job for %s", cuda_sample)
        submission = scheduler_commands.submit_script(
            str(test_datadir / "gpu_job.sh"),
            script_args=[cuda_sample],
            partition="gpu",
            nodes=1,
            slots=1,
        )
        return scheduler_commands.assert_job_submitted(submission.stdout)

    cuda_samples = (
        "1_Utilities/deviceQuery",
        "4_CUDA_Libraries/matrixMulCUBLAS",
    )
    submitted_job_ids = [_submit(cuda_sample) for cuda_sample in cuda_samples]

    for submitted_job_id in submitted_job_ids:
        # The timeout value is forwarded as-is; its unit is whatever wait_job_completed defines.
        scheduler_commands.wait_job_completed(submitted_job_id, timeout=20)
        scheduler_commands.assert_job_succeeded(submitted_job_id)
#!/bin/bash
#SBATCH --job-name=cuda-gpu-validate
#SBATCH --output=cuda-gpu-validate-%j.out

# Build and run a single CUDA sample (passed as a script argument) from the
# pre-installed /usr/local/cuda-samples-13.0 tree. CUDA samples 13.x are
# CMake-only and /usr/local/... isn't writable, so the script copies the
# sample into a temp dir before building.
#
# Argument:
#   $1  sample path relative to <samples root>/Samples,
#       e.g. 1_Utilities/deviceQuery
# Exit status:
#   2   wrong number of arguments, or the sample directory does not exist
#   otherwise the status of the first failing command (set -e), or of the
#   sample binary itself.

set -euo pipefail

if [[ $# -ne 1 ]]; then
  echo "Usage: sbatch $0 <category>/<sample>" >&2
  echo "  e.g. sbatch $0 1_Utilities/deviceQuery" >&2
  exit 2
fi

# Drop one trailing slash so the basename expansion below cannot come out
# empty (an empty SAMPLE_NAME would make the final step try to execute the
# build directory instead of the sample binary).
SAMPLE_REL=${1%/}
SAMPLE_NAME=${SAMPLE_REL##*/}

export PATH=/usr/local/cuda/bin:${PATH}

# Diagnostics written to the job output file: where the job ran and which GPU
# and CUDA toolchain it sees. With `set -e`, a failing nvidia-smi or nvcc
# aborts the job here, before any build is attempted.
echo "Node: $(hostname)"
echo "Sample: $SAMPLE_REL"
echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-unset}"
echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-unset}"
nvidia-smi -L
nvidia-smi
nvcc --version

SAMPLES_SRC=/usr/local/cuda-samples-13.0
if [[ ! -d "$SAMPLES_SRC/Samples/$SAMPLE_REL" ]]; then
  echo "ERROR: sample not found: $SAMPLES_SRC/Samples/$SAMPLE_REL" >&2
  exit 2
fi

# Scratch build tree, removed on every exit path (success or failure).
WORKDIR=$(mktemp -d)
trap 'rm -rf "$WORKDIR"' EXIT

# Shared scaffolding required by every sample (Common/, top-level cmake/)
cp -r "$SAMPLES_SRC"/{Common,cmake,CMakeLists.txt} "$WORKDIR"/

# Recreate the sample at the same relative location inside the scratch tree.
DST="$WORKDIR/Samples/$SAMPLE_REL"
mkdir -p "$(dirname "$DST")"
cp -r "$SAMPLES_SRC/Samples/$SAMPLE_REL" "$DST"

echo "===== Building $SAMPLE_REL ====="
cmake -S "$DST" -B "$DST/build"
# Build parallelism follows the Slurm CPU allocation when it is exported,
# falling back to 2 jobs otherwise.
cmake --build "$DST/build" -j"${SLURM_CPUS_PER_TASK:-2}"

echo "===== Running $SAMPLE_NAME ====="
# Last command: its exit status becomes the job's exit status.
"$DST/build/$SAMPLE_NAME"
# Queue used by the CUDA sample jobs: _test_gpu_workload submits gpu_job.sh
# with partition="gpu", which matches this queue's Name.
- Name: gpu
  Networking:
    SubnetIds:
      # Subnet id is injected by the test's config template rendering.
      - {{ private_subnet_id }}
  ComputeResources:
    - Name: cr1
      Instances:
        # Hard-coded instance type; unlike the preceding queue entry it does
        # not follow the templated instance variable of the test.
        - InstanceType: g4dn.2xlarge
      # NOTE(review): presumably MinCount 0 means no node is kept running while
      # the queue is idle, and MaxCount 1 caps the queue at a single node -
      # confirm against the ParallelCluster Scheduling schema.
      MinCount: 0
      MaxCount: 1