aws · gmarciani · May 19, 2026
@@ -91,6 +91,8 @@ def test_essential_features(
         cluster, region, instance, scheduler, default_threads_per_core, request, scheduler_commands_factory
     )
 
+    _test_gpu_workload(cluster, scheduler_commands_factory, test_datadir)
+
 
 def _test_mpi_job(
     scheduler, region, instance, cluster, test_datadir, scheduler_commands_factory, scaledown_idletime, max_queue_size
@@ -331,6 +333,29 @@ def _test_custom_bootstrap_scripts_args_quotes(cluster):
     )
 
 
+def _test_gpu_workload(cluster, scheduler_commands_factory, test_datadir):
+    """Submit a Slurm job that builds and runs CUDA samples on a GPU compute node."""
+    remote_command_executor = RemoteCommandExecutor(cluster)
+    scheduler_commands = scheduler_commands_factory(remote_command_executor)
+
+    samples = ["1_Utilities/deviceQuery", "4_CUDA_Libraries/matrixMulCUBLAS"]
+    job_ids = []
+    for sample in samples:
+        logging.info("Submitting CUDA sample job for %s", sample)
+        result = scheduler_commands.submit_script(
+            str(test_datadir / "gpu_job.sh"),
+            script_args=[sample],
+            partition="gpu",
+            nodes=1,
+            slots=1,
+        )
+        job_ids.append(scheduler_commands.assert_job_submitted(result.stdout))
+
+    for job_id in job_ids:
+        scheduler_commands.wait_job_completed(job_id, timeout=20)
+        scheduler_commands.assert_job_succeeded(job_id)
+
+
 def _test_disable_hyperthreading(
     cluster, region, instance, scheduler, default_threads_per_core, request, scheduler_commands_factory
 ):

@@ -0,0 +1,51 @@
+#!/bin/bash
+#SBATCH --job-name=cuda-gpu-validate
+#SBATCH --output=cuda-gpu-validate-%j.out
+
+# Build and run a single CUDA sample (passed as a script argument) from the
+# pre-installed /usr/local/cuda-samples-13.0 tree. CUDA samples 13.x are
+# CMake-only and /usr/local/... isn't writable, so the script copies the
+# sample into a temp dir before building.
+
+set -euo pipefail
+
+if [[ $# -ne 1 ]]; then
+    echo "Usage: sbatch $0 <category>/<sample>" >&2
+    echo "  e.g. sbatch $0 1_Utilities/deviceQuery"  >&2
+    exit 2
+fi
+SAMPLE_REL=$1
+SAMPLE_NAME=${SAMPLE_REL##*/}
+
+export PATH=/usr/local/cuda/bin:${PATH}
+
+echo "Node: $(hostname)"
+echo "Sample: $SAMPLE_REL"
+echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-unset}"
+echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-unset}"
+nvidia-smi -L
+nvidia-smi
+nvcc --version
+
+SAMPLES_SRC=/usr/local/cuda-samples-13.0
+if [[ ! -d "$SAMPLES_SRC/Samples/$SAMPLE_REL" ]]; then
+    echo "ERROR: sample not found: $SAMPLES_SRC/Samples/$SAMPLE_REL" >&2
+    exit 2
+fi
+
+WORKDIR=$(mktemp -d)
+trap 'rm -rf "$WORKDIR"' EXIT
+
+# Shared scaffolding required by every sample (Common/, top-level cmake/)
+cp -r "$SAMPLES_SRC"/{Common,cmake,CMakeLists.txt} "$WORKDIR"/
+
+DST="$WORKDIR/Samples/$SAMPLE_REL"
+mkdir -p "$(dirname "$DST")"
+cp -r "$SAMPLES_SRC/Samples/$SAMPLE_REL" "$DST"
+
+echo "===== Building $SAMPLE_REL ====="
+cmake -S "$DST" -B "$DST/build"
+cmake --build "$DST/build" -j"${SLURM_CPUS_PER_TASK:-2}"
+
+echo "===== Running $SAMPLE_NAME ====="
+"$DST/build/$SAMPLE_NAME"
@@ -121,6 +121,16 @@ Scheduling:
             - InstanceType: {{ instance }}
           MinCount: 1
           MaxCount: {{ max_queue_size }}
+    - Name: gpu # Same name as the partition="gpu" that _test_gpu_workload submits its jobs to
+      Networking:
+        SubnetIds:
+          - {{ private_subnet_id }}
+      ComputeResources:
+        - Name: cr1
+          Instances:
+            - InstanceType: g4dn.2xlarge # Fixed GPU instance type; not driven by the templated instance parameter
+          MinCount: 0
+          MaxCount: 1
   SlurmSettings:
     ScaledownIdletime: {{ scaledown_idletime }} # Use shorter scaledown time to test logs in slurm_suspend
 Monitoring: