def _test_gpu_workload(cluster, scheduler_commands_factory, test_datadir):
    """
    Run the CUDA sample jobs on the ``gpu`` partition and check that each one succeeds.

    One ``gpu_job.sh`` job is submitted per sample, with the sample path passed as the
    only script argument. Every job is submitted before any of them is awaited.
    """
    executor = RemoteCommandExecutor(cluster)
    slurm_commands = scheduler_commands_factory(executor)

    def _submit(sample_path):
        # Submit a single-node, single-slot job for one sample and return its job id.
        logging.info("Submitting CUDA sample job for %s", sample_path)
        submission = slurm_commands.submit_script(
            str(test_datadir / "gpu_job.sh"),
            script_args=[sample_path],
            partition="gpu",
            nodes=1,
            slots=1,
        )
        return slurm_commands.assert_job_submitted(submission.stdout)

    cuda_samples = ("1_Utilities/deviceQuery", "4_CUDA_Libraries/matrixMulCUBLAS")
    submitted_job_ids = [_submit(sample_path) for sample_path in cuda_samples]

    # Only start waiting once everything is queued, then verify the jobs in submission order.
    for submitted_job_id in submitted_job_ids:
        slurm_commands.wait_job_completed(submitted_job_id, timeout=20)
        slurm_commands.assert_job_succeeded(submitted_job_id)
#!/bin/bash
#SBATCH --job-name=cuda-gpu-validate
#SBATCH --output=cuda-gpu-validate-%j.out

# Build and run a single CUDA sample (passed as a script argument) from the
# pre-installed CUDA samples tree. CUDA samples 13.x are CMake-only and
# /usr/local/... isn't writable, so the script copies the sample into a
# temp dir before building.
#
# Argument:
#   <category>/<sample>  path of the sample relative to the Samples/ directory,
#                        e.g. 1_Utilities/deviceQuery
# Environment:
#   CUDA_SAMPLES_DIR     optional override of the samples root
#                        (default: /usr/local/cuda-samples-13.0)
# Exit status:
#   2 on a usage error or when the sample directory does not exist; otherwise
#   the status of the first failing command (set -e) or of the sample binary.

set -euo pipefail

usage() {
    echo "Usage: sbatch $0 <category>/<sample>" >&2
    echo "  e.g. sbatch $0 1_Utilities/deviceQuery" >&2
    exit 2
}

[[ $# -eq 1 ]] || usage

# Drop a trailing slash: otherwise SAMPLE_NAME below would come out empty and
# the final command would try to execute the build directory itself.
SAMPLE_REL=${1%/}
# An empty path would pass the directory check below and copy the whole Samples tree.
[[ -n "$SAMPLE_REL" ]] || usage
SAMPLE_NAME=${SAMPLE_REL##*/}

export PATH=/usr/local/cuda/bin:${PATH}

# Dump the execution context so a failing job can be diagnosed from its output file.
echo "Node: $(hostname)"
echo "Sample: $SAMPLE_REL"
echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-unset}"
echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-unset}"
nvidia-smi -L
nvidia-smi
nvcc --version

SAMPLES_SRC=${CUDA_SAMPLES_DIR:-/usr/local/cuda-samples-13.0}
if [[ ! -d "$SAMPLES_SRC/Samples/$SAMPLE_REL" ]]; then
    echo "ERROR: sample not found: $SAMPLES_SRC/Samples/$SAMPLE_REL" >&2
    exit 2
fi

WORKDIR=$(mktemp -d)
# Remove the scratch copy on every exit path, including failures under set -e.
trap 'rm -rf "$WORKDIR"' EXIT

# Shared scaffolding required by every sample (Common/, top-level cmake/)
cp -r "$SAMPLES_SRC"/{Common,cmake,CMakeLists.txt} "$WORKDIR"/

# Recreate the sample at the same relative location inside the scratch copy.
DST="$WORKDIR/Samples/$SAMPLE_REL"
mkdir -p "$(dirname "$DST")"
cp -r "$SAMPLES_SRC/Samples/$SAMPLE_REL" "$DST"

echo "===== Building $SAMPLE_REL ====="
cmake -S "$DST" -B "$DST/build"
# Use the CPUs Slurm reports for the task, falling back to 2 when the variable is unset.
cmake --build "$DST/build" -j"${SLURM_CPUS_PER_TASK:-2}"

echo "===== Running $SAMPLE_NAME ====="
"$DST/build/$SAMPLE_NAME"
--- .../tests/basic/test_essential_features.py | 3 ++- .../test_essential_features/pcluster.config.yaml | 4 +++- tests/integration-tests/utils.py | 9 +++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/tests/basic/test_essential_features.py b/tests/integration-tests/tests/basic/test_essential_features.py index c260abfe76..39a3029944 100644 --- a/tests/integration-tests/tests/basic/test_essential_features.py +++ b/tests/integration-tests/tests/basic/test_essential_features.py @@ -15,7 +15,7 @@ from assertpy import assert_that, soft_assertions from constants import UNSUPPORTED_OSES_FOR_DCV from remote_command_executor import RemoteCommandExecutor -from utils import check_status, is_dcv_supported, test_cluster_health_metric +from utils import check_status, get_flexible_gpu_instance_types, is_dcv_supported, test_cluster_health_metric from tests.basic.disable_hyperthreading_utils import _test_disable_hyperthreading_settings from tests.basic.log_rotation_utils import _test_compute_log_rotation, _test_headnode_log_rotation @@ -65,6 +65,7 @@ def test_essential_features( dcv_enabled=dcv_enabled, max_queue_size=max_queue_size, scaledown_idletime=scaledown_idletime, + flexible_gpu_instance_types=get_flexible_gpu_instance_types(instance, region), ) cluster = clusters_factory(cluster_config) diff --git a/tests/integration-tests/tests/basic/test_essential_features/test_essential_features/pcluster.config.yaml b/tests/integration-tests/tests/basic/test_essential_features/test_essential_features/pcluster.config.yaml index 4d93d06326..4a5fe443ac 100644 --- a/tests/integration-tests/tests/basic/test_essential_features/test_essential_features/pcluster.config.yaml +++ b/tests/integration-tests/tests/basic/test_essential_features/test_essential_features/pcluster.config.yaml @@ -128,7 +128,9 @@ Scheduling: ComputeResources: - Name: cr1 Instances: - - InstanceType: g4dn.2xlarge + {% for instance_type in flexible_gpu_instance_types %} + - 
@cache
def get_flexible_gpu_instance_types(instance, region):
    """
    Return a list of NVIDIA GPU instance types compatible with ``instance``'s architecture.

    The preferred GPU instance type ("g4dn.2xlarge" when the architecture of ``instance`` is
    x86_64, "g5g.2xlarge" otherwise) always comes first, followed by the types returned by
    ``get_similar_instance_types`` in the order that helper yields them, with duplicates removed.
    The ordering is deterministic, so a cluster configuration rendered from this list is
    reproducible across runs.

    Results are memoized per ``(instance, region)`` pair to avoid repeating the underlying lookups.
    The returned list is the cached object itself: callers must not mutate it.

    :param instance: instance type whose architecture selects the GPU instance family
    :param region: region forwarded to the architecture and similar-instance lookups
    :return: list of unique instance type names, preferred type first
    """
    architecture = get_architecture_supported_by_instance_type(instance, region)
    gpu_instance_type = "g4dn.2xlarge" if architecture == "x86_64" else "g5g.2xlarge"
    similar_instance_types = get_similar_instance_types(gpu_instance_type, region, 5)
    # dict.fromkeys de-duplicates while keeping insertion order; going through a set would
    # make the order depend on string hash randomization and change from run to run.
    return list(dict.fromkeys([gpu_instance_type, *similar_instance_types]))