From a7f87e2f6740979b4f21ccd9916fbe4868b631b4 Mon Sep 17 00:00:00 2001 From: Aayush Joglekar Date: Wed, 6 May 2026 17:09:59 +0200 Subject: [PATCH 1/6] Ignore ROCm-LLVM on aarch64 --- eb_hooks.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/eb_hooks.py b/eb_hooks.py index 196b8c26..46841923 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -55,6 +55,7 @@ # Make sure a single environment variable name is used for this throughout the hooks EESSI_IGNORE_ZEN4_GCC1220_ENVVAR="EESSI_IGNORE_LMOD_ERROR_ZEN4_GCC1220" +EESSI_IGNORE_AARCH64_ROCMLLVM641_ENVVAR="EESSI_IGNORE_LMOD_ERROR_AARCH64_ROCMLLVM641" STACK_REPROD_SUBDIR = 'reprod' @@ -840,6 +841,19 @@ def is_unsupported_module(self): setattr(self, EESSI_UNSUPPORTED_MODULE_ATTR, UnsupportedModule(envvar=var,errmsg=errmsg)) return True + # ROCm-LLVM 6.4.1 is not supported on aarch64 + # see: https://github.com/EESSI/software-layer/pull/1473#issuecomment-4370846033 + if not os.getenv("EESSI_OVERRIDE_ROCM_VERSION_CHECK"): + if ec.name == 'ROCm-LLVM' and ec.version in ('6.4.1',): + if get_eessi_envvar('EESSI_CPU_FAMILY') == 'aarch64': + msg = "ROCm-LLVM/6.4.1 is not supported on aarch64 architectures. " + msg += "Building with '--module-only --force' and injecting an LmodError into the modulefile." + msg += "You can override this behaviour by setting the EESSI_OVERRIDE_ROCM_VERSION_CHECK environment variable." + print_warning(msg) + var=EESSI_IGNORE_AARCH64_ROCMLLVM641_ENVVAR + setattr(self, EESSI_UNSUPPORTED_MODULE_ATTR, UnsupportedModule(envvar=var, errmsg=errmsg)) + return True + # If all the above logic passed, this module is supported setattr(self, EESSI_SUPPORTED_MODULE_ATTR, True) return False From 0e2505bd795d6e0b11a4ad718e130480c6adb44a Mon Sep 17 00:00:00 2001 From: Aayush Joglekar Date: Wed, 6 May 2026 17:21:00 +0200 Subject: [PATCH 2/6] Fix name and version fetch --- eb_hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eb_hooks.py b/eb_hooks.py index 46841923..f22b4a9e 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -844,7 +844,7 @@ def is_unsupported_module(self): # ROCm-LLVM 6.4.1 is not supported on aarch64 # see: https://github.com/EESSI/software-layer/pull/1473#issuecomment-4370846033 if not os.getenv("EESSI_OVERRIDE_ROCM_VERSION_CHECK"): - if ec.name == 'ROCm-LLVM' and ec.version in ('6.4.1',): + if self.cfg.name == 'ROCm-LLVM' and self.cfg.version in ('6.4.1',): if get_eessi_envvar('EESSI_CPU_FAMILY') == 'aarch64': msg = "ROCm-LLVM/6.4.1 is not supported on aarch64 architectures. " msg += "Building with '--module-only --force' and injecting an LmodError into the modulefile." From e8e31e1aec99450d8c7f457f8a581494a237c364 Mon Sep 17 00:00:00 2001 From: Aayush Joglekar Date: Wed, 6 May 2026 22:58:41 +0300 Subject: [PATCH 3/6] Add errmsg --- eb_hooks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/eb_hooks.py b/eb_hooks.py index f22b4a9e..7225c254 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -846,10 +846,11 @@ def is_unsupported_module(self): if not os.getenv("EESSI_OVERRIDE_ROCM_VERSION_CHECK"): if self.cfg.name == 'ROCm-LLVM' and self.cfg.version in ('6.4.1',): if get_eessi_envvar('EESSI_CPU_FAMILY') == 'aarch64': - msg = "ROCm-LLVM/6.4.1 is not supported on aarch64 architectures. " msg += "Building with '--module-only --force' and injecting an LmodError into the modulefile." msg += "You can override this behaviour by setting the EESSI_OVERRIDE_ROCM_VERSION_CHECK environment variable." print_warning(msg) + errmsg = "ROCm-LLVM/6.4.1 is not supported on the aarch64 architecture." + errmsg += "For more details, see: https://github.com/EESSI/software-layer/pull/1473#issuecomment-4370846033" var=EESSI_IGNORE_AARCH64_ROCMLLVM641_ENVVAR setattr(self, EESSI_UNSUPPORTED_MODULE_ATTR, UnsupportedModule(envvar=var, errmsg=errmsg)) return True From 08147035481535b74bd0c314bbdc497f02ae6747 Mon Sep 17 00:00:00 2001 From: Aayush Joglekar Date: Thu, 7 May 2026 15:45:08 +0300 Subject: [PATCH 4/6] Remove hook and add EESSI warning for aarch+amdgpu --- eb_hooks.py | 15 --------------- init/modules/EESSI/2023.06.lua | 3 +++ 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index 7225c254..196b8c26 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -55,7 +55,6 @@ # Make sure a single environment variable name is used for this throughout the hooks EESSI_IGNORE_ZEN4_GCC1220_ENVVAR="EESSI_IGNORE_LMOD_ERROR_ZEN4_GCC1220" -EESSI_IGNORE_AARCH64_ROCMLLVM641_ENVVAR="EESSI_IGNORE_LMOD_ERROR_AARCH64_ROCMLLVM641" STACK_REPROD_SUBDIR = 'reprod' @@ -841,20 +840,6 @@ def is_unsupported_module(self): setattr(self, EESSI_UNSUPPORTED_MODULE_ATTR, UnsupportedModule(envvar=var,errmsg=errmsg)) return True - # ROCm-LLVM 6.4.1 is not supported on aarch64 - # see: https://github.com/EESSI/software-layer/pull/1473#issuecomment-4370846033 - if not os.getenv("EESSI_OVERRIDE_ROCM_VERSION_CHECK"): - if self.cfg.name == 'ROCm-LLVM' and self.cfg.version in ('6.4.1',): - if get_eessi_envvar('EESSI_CPU_FAMILY') == 'aarch64': - msg += "Building with '--module-only --force' and injecting an LmodError into the modulefile." - msg += "You can override this behaviour by setting the EESSI_OVERRIDE_ROCM_VERSION_CHECK environment variable." - print_warning(msg) - errmsg = "ROCm-LLVM/6.4.1 is not supported on the aarch64 architecture." - errmsg += "For more details, see: https://github.com/EESSI/software-layer/pull/1473#issuecomment-4370846033" - var=EESSI_IGNORE_AARCH64_ROCMLLVM641_ENVVAR - setattr(self, EESSI_UNSUPPORTED_MODULE_ATTR, UnsupportedModule(envvar=var, errmsg=errmsg)) - return True - # If all the above logic passed, this module is supported setattr(self, EESSI_SUPPORTED_MODULE_ATTR, True) return False diff --git a/init/modules/EESSI/2023.06.lua b/init/modules/EESSI/2023.06.lua index fca0d69c..b464d82b 100644 --- a/init/modules/EESSI/2023.06.lua +++ b/init/modules/EESSI/2023.06.lua @@ -180,6 +180,9 @@ if not (archdetect_accel == nil or archdetect_accel == '') then -- /cvmfs/software.eessi.io/versions//software/linux/x86_64/amd/zen3/accel/nvidia/cc80/modules/all eessi_module_path_accel = pathJoin(eessi_accel_software_path, archdetect_accel, eessi_modules_subdir) eessiDebug("Checking if " .. eessi_module_path_accel .. " exists") + if eessi_cpu_family == "aarch64" and archdetect_accel:match("^accel/amd/") then + LmodMessage("aarch64 CPU detected, no support for AMD GPUs in production repository yet.") + end if not isDir(eessi_module_path_accel) then -- fall back to major version GPU arch if the exact one is not an option (i.e, 7.5 -> 7.0) local original_archdetect_accel = archdetect_accel From 528821afeec8fca67b0400681890e8d2bebc7c33 Mon Sep 17 00:00:00 2001 From: Aayush Joglekar Date: Thu, 7 May 2026 17:40:35 +0200 Subject: [PATCH 5/6] Change error message --- init/modules/EESSI/2023.06.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/modules/EESSI/2023.06.lua b/init/modules/EESSI/2023.06.lua index b464d82b..1cbc9c88 100644 --- a/init/modules/EESSI/2023.06.lua +++ b/init/modules/EESSI/2023.06.lua @@ -181,7 +181,7 @@ if not (archdetect_accel == nil or archdetect_accel == '') then eessi_module_path_accel = pathJoin(eessi_accel_software_path, archdetect_accel, eessi_modules_subdir) eessiDebug("Checking if " .. eessi_module_path_accel .. " exists") if eessi_cpu_family == "aarch64" and archdetect_accel:match("^accel/amd/") then - LmodMessage("aarch64 CPU detected, no support for AMD GPUs in production repository yet.") + LmodMessage("aarch64 CPU detected, AMD ROCm doesn't support aarch64 yet") end if not isDir(eessi_module_path_accel) then -- fall back to major version GPU arch if the exact one is not an option (i.e, 7.5 -> 7.0) From 8539813d27081e64217b3a7b15102765caf018df Mon Sep 17 00:00:00 2001 From: Aayush Joglekar Date: Thu, 7 May 2026 20:59:44 +0200 Subject: [PATCH 6/6] Update 2023.06.lua Co-authored-by: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> --- init/modules/EESSI/2023.06.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/modules/EESSI/2023.06.lua b/init/modules/EESSI/2023.06.lua index 1cbc9c88..45814cbc 100644 --- a/init/modules/EESSI/2023.06.lua +++ b/init/modules/EESSI/2023.06.lua @@ -181,7 +181,7 @@ if not (archdetect_accel == nil or archdetect_accel == '') then eessi_module_path_accel = pathJoin(eessi_accel_software_path, archdetect_accel, eessi_modules_subdir) eessiDebug("Checking if " .. eessi_module_path_accel .. " exists") if eessi_cpu_family == "aarch64" and archdetect_accel:match("^accel/amd/") then - LmodMessage("aarch64 CPU detected, AMD ROCm doesn't support aarch64 yet") + LmodMessage("AArch64 CPU architecture and AMD GPU detected. However, AMD ROCm does not support AArch64. Thus, AMD GPU-enabled EESSI installations are not available on this system.") end if not isDir(eessi_module_path_accel) then -- fall back to major version GPU arch if the exact one is not an option (i.e, 7.5 -> 7.0)