From 28744c3d50d82cad5af7cc717d367c6917dd2ec7 Mon Sep 17 00:00:00 2001
From: Giacomo Marciani <mgiacomo@amazon.com>
Date: Tue, 19 May 2026 18:27:10 +0200
Subject: [PATCH 1/2] [Test] Extend test_essential_features to also cover a
 basic GPU workload leveraging CUDA Samples.

---
 .../tests/basic/test_essential_features.py    | 25 +++++++++
 .../test_essential_features/gpu_job.sh        | 51 +++++++++++++++++++
 .../pcluster.config.yaml                      | 10 ++++
 3 files changed, 86 insertions(+)
 create mode 100644 tests/integration-tests/tests/basic/test_essential_features/test_essential_features/gpu_job.sh
diff --git a/tests/integration-tests/tests/basic/test_essential_features.py b/tests/integration-tests/tests/basic/test_essential_features.py
index deea90c56e..c260abfe76 100644
--- a/tests/integration-tests/tests/basic/test_essential_features.py
+++ b/tests/integration-tests/tests/basic/test_essential_features.py
@@ -91,6 +91,8 @@ def test_essential_features(
         cluster, region, instance, scheduler, default_threads_per_core, request, scheduler_commands_factory
     )
 
+    _test_gpu_workload(cluster, scheduler_commands_factory, test_datadir)
+
 
 def _test_mpi_job(
     scheduler, region, instance, cluster, test_datadir, scheduler_commands_factory, scaledown_idletime, max_queue_size
@@ -331,6 +333,29 @@ def _test_custom_bootstrap_scripts_args_quotes(cluster):
     )
 
 
+def _test_gpu_workload(cluster, scheduler_commands_factory, test_datadir):
+    """Submit one gpu_job.sh job per CUDA sample to the "gpu" partition and assert that each job succeeds."""
+    remote_command_executor = RemoteCommandExecutor(cluster)
+    scheduler_commands = scheduler_commands_factory(remote_command_executor)
+
+    samples = ["1_Utilities/deviceQuery", "4_CUDA_Libraries/matrixMulCUBLAS"]  # <category>/<sample> pairs
+    job_ids = []
+    for sample in samples:
+        logging.info("Submitting CUDA sample job for %s", sample)
+        result = scheduler_commands.submit_script(
+            str(test_datadir / "gpu_job.sh"),
+            script_args=[sample],  # gpu_job.sh expects exactly one argument
+            partition="gpu",
+            nodes=1,
+            slots=1,
+        )
+        job_ids.append(scheduler_commands.assert_job_submitted(result.stdout))
+
+    for job_id in job_ids:  # all jobs are queued first, then awaited in submission order
+        scheduler_commands.wait_job_completed(job_id, timeout=20)  # NOTE(review): unit presumably minutes - confirm
+        scheduler_commands.assert_job_succeeded(job_id)
+
+
 def _test_disable_hyperthreading(
     cluster, region, instance, scheduler, default_threads_per_core, request, scheduler_commands_factory
 ):
diff --git a/tests/integration-tests/tests/basic/test_essential_features/test_essential_features/gpu_job.sh b/tests/integration-tests/tests/basic/test_essential_features/test_essential_features/gpu_job.sh
new file mode 100644
index 0000000000..4e1459bd70
--- /dev/null
+++ b/tests/integration-tests/tests/basic/test_essential_features/test_essential_features/gpu_job.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#SBATCH --job-name=cuda-gpu-validate
+#SBATCH --output=cuda-gpu-validate-%j.out
+
+# Build and run one CUDA sample, given as <category>/<sample> in $1, from the
+# pre-installed /usr/local/cuda-samples-13.0 tree. CUDA samples 13.x are
+# CMake-only and /usr/local/... isn't writable, so the sample and the shared
+# build scaffolding are copied into a temp dir and built there.
+
+set -euo pipefail  # abort on any error, unset variable or failed pipeline stage
+
+if [[ $# -ne 1 ]]; then
+    echo "Usage: sbatch $0 <category>/<sample>" >&2
+    echo "  e.g. sbatch $0 1_Utilities/deviceQuery"  >&2
+    exit 2
+fi
+SAMPLE_REL=$1
+SAMPLE_NAME=${SAMPLE_REL##*/}  # last path component, e.g. deviceQuery
+
+export PATH=/usr/local/cuda/bin:${PATH}  # so nvcc (called below) resolves; assumes the toolkit lives here
+
+echo "Node: $(hostname)"
+echo "Sample: $SAMPLE_REL"
+echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-unset}"
+echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-unset}"
+nvidia-smi -L  # diagnostics only, but under set -e a failure here still aborts the job
+nvidia-smi
+nvcc --version
+
+SAMPLES_SRC=/usr/local/cuda-samples-13.0
+if [[ ! -d "$SAMPLES_SRC/Samples/$SAMPLE_REL" ]]; then
+    echo "ERROR: sample not found: $SAMPLES_SRC/Samples/$SAMPLE_REL" >&2
+    exit 2
+fi
+
+WORKDIR=$(mktemp -d)
+trap 'rm -rf "$WORKDIR"' EXIT  # remove the temp build tree on any exit, success or failure
+
+# Shared scaffolding required by every sample (Common/, cmake/, top-level CMakeLists.txt)
+cp -r "$SAMPLES_SRC"/{Common,cmake,CMakeLists.txt} "$WORKDIR"/
+
+DST="$WORKDIR/Samples/$SAMPLE_REL"  # mirror the original layout inside the temp dir
+mkdir -p "$(dirname "$DST")"
+cp -r "$SAMPLES_SRC/Samples/$SAMPLE_REL" "$DST"
+
+echo "===== Building $SAMPLE_REL ====="
+cmake -S "$DST" -B "$DST/build"
+cmake --build "$DST/build" -j"${SLURM_CPUS_PER_TASK:-2}"  # parallelism: SLURM_CPUS_PER_TASK if set, else 2
+
+echo "===== Running $SAMPLE_NAME ====="
+"$DST/build/$SAMPLE_NAME"  # assumes the built binary is named after the sample directory
diff --git a/tests/integration-tests/tests/basic/test_essential_features/test_essential_features/pcluster.config.yaml b/tests/integration-tests/tests/basic/test_essential_features/test_essential_features/pcluster.config.yaml
index 11305366e3..4d93d06326 100644
--- a/tests/integration-tests/tests/basic/test_essential_features/test_essential_features/pcluster.config.yaml
+++ b/tests/integration-tests/tests/basic/test_essential_features/test_essential_features/pcluster.config.yaml
@@ -121,6 +121,16 @@ Scheduling:
             - InstanceType: {{ instance }}
           MinCount: 1
           MaxCount: {{ max_queue_size }}
+    - Name: gpu
+      Networking:
+        SubnetIds:
+          - {{ private_subnet_id }}
+      ComputeResources:
+        - Name: cr1
+          Instances:
+            - InstanceType: g4dn.2xlarge
+          MinCount: 0
+          MaxCount: 1
   SlurmSettings:
     ScaledownIdletime: {{ scaledown_idletime }} # Use shorter scaledown time to test logs in slurm_suspend
 Monitoring:

From 0c357ec775eb1724de8d663bb447494e5516f9ee Mon Sep 17 00:00:00 2001
From: Giacomo Marciani <mgiacomo@amazon.com>
Date: Wed, 20 May 2026 10:12:07 +0200
Subject: [PATCH 2/2] [Test] In test_essential_features, use flexible
 GPU-powered instance types to reduce the risk of ICEs.

Flexible instance types are cached to reduce the number of EC2 requests.
---
 .../tests/basic/test_essential_features.py               | 3 ++-
 .../test_essential_features/pcluster.config.yaml         | 4 +++-
 tests/integration-tests/utils.py                         | 9 +++++++++
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/tests/integration-tests/tests/basic/test_essential_features.py b/tests/integration-tests/tests/basic/test_essential_features.py
index c260abfe76..39a3029944 100644
--- a/tests/integration-tests/tests/basic/test_essential_features.py
+++ b/tests/integration-tests/tests/basic/test_essential_features.py
@@ -15,7 +15,7 @@
 from assertpy import assert_that, soft_assertions
 from constants import UNSUPPORTED_OSES_FOR_DCV
 from remote_command_executor import RemoteCommandExecutor
-from utils import check_status, is_dcv_supported, test_cluster_health_metric
+from utils import check_status, get_flexible_gpu_instance_types, is_dcv_supported, test_cluster_health_metric
 
 from tests.basic.disable_hyperthreading_utils import _test_disable_hyperthreading_settings
 from tests.basic.log_rotation_utils import _test_compute_log_rotation, _test_headnode_log_rotation
@@ -65,6 +65,7 @@ def test_essential_features(
         dcv_enabled=dcv_enabled,
         max_queue_size=max_queue_size,
         scaledown_idletime=scaledown_idletime,
+        flexible_gpu_instance_types=get_flexible_gpu_instance_types(instance, region),
     )
     cluster = clusters_factory(cluster_config)
 
diff --git a/tests/integration-tests/tests/basic/test_essential_features/test_essential_features/pcluster.config.yaml b/tests/integration-tests/tests/basic/test_essential_features/test_essential_features/pcluster.config.yaml
index 4d93d06326..4a5fe443ac 100644
--- a/tests/integration-tests/tests/basic/test_essential_features/test_essential_features/pcluster.config.yaml
+++ b/tests/integration-tests/tests/basic/test_essential_features/test_essential_features/pcluster.config.yaml
@@ -128,7 +128,9 @@ Scheduling:
       ComputeResources:
         - Name: cr1
           Instances:
-            - InstanceType: g4dn.2xlarge
+            {% for instance_type in flexible_gpu_instance_types %}
+            - InstanceType: {{ instance_type }}
+            {% endfor %}
           MinCount: 0
           MaxCount: 1
   SlurmSettings:
diff --git a/tests/integration-tests/utils.py b/tests/integration-tests/utils.py
index 6dc310a238..18d14eec8d 100644
--- a/tests/integration-tests/utils.py
+++ b/tests/integration-tests/utils.py
@@ -19,6 +19,7 @@
 import string
 import subprocess
 from datetime import datetime, timedelta
+from functools import cache
 from hashlib import sha1
 
 import boto3
@@ -1073,6 +1074,14 @@ def get_similar_instance_types(instance_type: str, region: str = None, max_items
     return similar_instances
 
 
+@cache
+def get_flexible_gpu_instance_types(instance, region):
+    """Return the base GPU instance type for ``instance``'s architecture first, then its sorted similar types."""
+    architecture = get_architecture_supported_by_instance_type(instance, region)
+    base = "g4dn.2xlarge" if architecture == "x86_64" else "g5g.2xlarge"  # g5g for any non-x86_64 architecture
+    return [base, *sorted(set(get_similar_instance_types(base, region, 5)) - {base})]  # dedupe, deterministic order
+
+
 def verify_cluster_node_config_version_in_ddb(region, cluster_name, instance_id, expected_version):
     """
     Verify that a cluster node has the correct config version in DynamoDB.