From 85377ade073e32fe75c35ebdbbb2937cbb4505d9 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Wed, 25 Mar 2026 22:59:58 +0000 Subject: [PATCH 1/4] Fix amd device manager. --- superbench/common/utils/device_manager.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/superbench/common/utils/device_manager.py b/superbench/common/utils/device_manager.py index 2a594fef0..f49936885 100644 --- a/superbench/common/utils/device_manager.py +++ b/superbench/common/utils/device_manager.py @@ -389,10 +389,17 @@ def get_device_power(self, idx): """ try: power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx]) + # amdsmi sets fields to 'N/A' when the hardware reports 0xFFFF (unsupported). + # On MI300X, average_socket_power is unsupported, so fall back to current_socket_power. + power = power_measure.get('average_socket_power') + if not isinstance(power, (int, float)): + power = power_measure.get('current_socket_power') + if not isinstance(power, (int, float)): + return None + return int(power) except Exception as err: logger.warning('Get device power failed: {}'.format(str(err))) return None - return int(power_measure['average_socket_power']) def get_device_power_limit(self, idx): """Get the power management limit of device, unit: watt. @@ -405,10 +412,16 @@ def get_device_power_limit(self, idx): """ try: power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx]) + power_limit = power_measure.get('power_limit') + if not isinstance(power_limit, (int, float)): + return None + # amdsmi returns power_limit in microwatts (e.g. 750000000 for 750W), convert to watts. + if power_limit > 100000: + power_limit = power_limit // 1000000 + return int(power_limit) except Exception as err: logger.warning('Get device power limit failed: {}'.format(str(err))) return None - return int(power_measure['power_limit']) def get_device_memory(self, idx): """Get the memory information of device, unit: byte. From 439cddc76012c38b0900aa516d97cf5bf9b471a7 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Mon, 27 Apr 2026 21:57:14 +0000 Subject: [PATCH 2/4] Address PR review comments for AMD power monitoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Use numbers.Real instead of (int, float) so numeric types like numpy scalars and ctypes ints are accepted, while still rejecting bool. - Explicitly check for missing keys in amdsmi power_info and log a warning so monitoring failures are diagnosable instead of silently returning None. - Replace the magic threshold 100000 with named constants _AMDSMI_MICROWATTS_PER_WATT and _AMDSMI_MICROWATTS_THRESHOLD, with a clearer comment explaining the µW vs W detection heuristic. - Add unit tests covering: average_socket_power supported, fallback to current_socket_power on 'N/A', both unsupported -> None, missing keys -> None, power_limit µW->W conversion, watts passthrough, non-numeric power_limit -> None, and missing power_limit key -> None. --- superbench/common/utils/device_manager.py | 40 ++++++--- tests/common/test_device_manager.py | 101 ++++++++++++++++++++++ 2 files changed, 130 insertions(+), 11 deletions(-) diff --git a/superbench/common/utils/device_manager.py b/superbench/common/utils/device_manager.py index f49936885..a9a47ab01 100644 --- a/superbench/common/utils/device_manager.py +++ b/superbench/common/utils/device_manager.py @@ -3,6 +3,7 @@ """Device Managerment Library Utility.""" +import numbers from typing import Optional from superbench.common.utils import logger @@ -15,6 +16,12 @@ elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics': import amdsmi as rocml +# amdsmi reports power in microwatts; convert to watts when the raw value +# is at or above this threshold (any plausible per-GPU watt value is well +# below 100000, while µW values for real cards are tens of millions). +_AMDSMI_MICROWATTS_PER_WATT = 1_000_000 +_AMDSMI_MICROWATTS_THRESHOLD = 100_000 + class DeviceManager: """Device management base module.""" @@ -391,12 +398,19 @@ def get_device_power(self, idx): power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx]) # amdsmi sets fields to 'N/A' when the hardware reports 0xFFFF (unsupported). # On MI300X, average_socket_power is unsupported, so fall back to current_socket_power. - power = power_measure.get('average_socket_power') - if not isinstance(power, (int, float)): - power = power_measure.get('current_socket_power') - if not isinstance(power, (int, float)): - return None - return int(power) + for key in ('average_socket_power', 'current_socket_power'): + if key not in power_measure: + logger.warning('amdsmi power_info missing expected key: {}'.format(key)) + continue + power = power_measure[key] + if isinstance(power, numbers.Real) and not isinstance(power, bool): + try: + return int(power) + except (TypeError, ValueError) as conv_err: + logger.warning( + 'Failed to convert amdsmi {} value {!r} to int: {}'.format(key, power, conv_err) + ) + return None except Exception as err: logger.warning('Get device power failed: {}'.format(str(err))) return None @@ -412,12 +426,16 @@ def get_device_power_limit(self, idx): """ try: power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx]) - power_limit = power_measure.get('power_limit') - if not isinstance(power_limit, (int, float)): + if 'power_limit' not in power_measure: + logger.warning('amdsmi power_info missing expected key: power_limit') + return None + power_limit = power_measure['power_limit'] + if not isinstance(power_limit, numbers.Real) or isinstance(power_limit, bool): return None - # amdsmi returns power_limit in microwatts (e.g. 750000000 for 750W), convert to watts. - if power_limit > 100000: - power_limit = power_limit // 1000000 + # amdsmi returns power_limit in microwatts (e.g. 750000000 for 750W) on some + # ROCm versions and in watts on others. Detect µW by magnitude and convert. + if power_limit >= _AMDSMI_MICROWATTS_THRESHOLD: + power_limit = power_limit // _AMDSMI_MICROWATTS_PER_WATT return int(power_limit) except Exception as err: logger.warning('Get device power limit failed: {}'.format(str(err))) diff --git a/tests/common/test_device_manager.py b/tests/common/test_device_manager.py index d78d1bf1d..b50d5cfb1 100644 --- a/tests/common/test_device_manager.py +++ b/tests/common/test_device_manager.py @@ -3,12 +3,22 @@ """Tests for nvidia_helper module.""" +import sys import numbers from unittest import mock from tests.helper import decorator from superbench.common.utils import device_manager as dm +# Ensure the real device_manager module is loaded (bypassing the LazyImport +# proxy that ``dm`` may be) and inject a placeholder ``rocml`` so that +# AmdDeviceManager.__del__ does not raise NameError during GC of test +# instances on non-ROCm hosts. +import superbench.common.utils.device_manager as _dm_real # noqa: E402, F401 + +_DM_MODULE = 'superbench.common.utils.device_manager' +if not hasattr(sys.modules[_DM_MODULE], 'rocml'): + sys.modules[_DM_MODULE].rocml = mock.Mock() @decorator.cuda_test @mock.patch('superbench.common.utils.process.run_command') @@ -52,3 +62,94 @@ def test_nvidia_helper_utils(mock_run_command): 'gpu_remap_none': 0 } assert (gpu_remapped_info == expected) + + +def _make_amd_manager(): + """Build an AmdDeviceManager instance bypassing __init__ (no ROCm required).""" + manager = dm.AmdDeviceManager.__new__(dm.AmdDeviceManager) + manager._device_handlers = [mock.Mock()] + return manager + + +def test_amd_get_device_power_average_supported(): + """average_socket_power is numeric -> returned as int.""" + manager = _make_amd_manager() + rocml_mock = mock.Mock() + rocml_mock.amdsmi_get_power_info.return_value = { + 'average_socket_power': 123.7, + 'current_socket_power': 456, + 'power_limit': 750, + } + with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True): + assert manager.get_device_power(0) == 123 + + +def test_amd_get_device_power_falls_back_to_current(): + """average_socket_power='N/A' -> fall back to current_socket_power.""" + manager = _make_amd_manager() + rocml_mock = mock.Mock() + rocml_mock.amdsmi_get_power_info.return_value = { + 'average_socket_power': 'N/A', + 'current_socket_power': 321, + 'power_limit': 750, + } + with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True): + assert manager.get_device_power(0) == 321 + + +def test_amd_get_device_power_both_unsupported_returns_none(): + """Both fields non-numeric -> returns None.""" + manager = _make_amd_manager() + rocml_mock = mock.Mock() + rocml_mock.amdsmi_get_power_info.return_value = { + 'average_socket_power': 'N/A', + 'current_socket_power': 'N/A', + 'power_limit': 750, + } + with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True): + assert manager.get_device_power(0) is None + + +def test_amd_get_device_power_missing_keys_returns_none(): + """Missing keys -> None and warning logged (no exception).""" + manager = _make_amd_manager() + rocml_mock = mock.Mock() + rocml_mock.amdsmi_get_power_info.return_value = {} + with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True): + assert manager.get_device_power(0) is None + + +def test_amd_get_device_power_limit_microwatts_converted(): + """power_limit reported in µW (e.g., 750000000) -> converted to 750 W.""" + manager = _make_amd_manager() + rocml_mock = mock.Mock() + rocml_mock.amdsmi_get_power_info.return_value = {'power_limit': 750_000_000} + with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True): + assert manager.get_device_power_limit(0) == 750 + + +def test_amd_get_device_power_limit_watts_passthrough(): + """power_limit already in watts (small value) -> returned as-is.""" + manager = _make_amd_manager() + rocml_mock = mock.Mock() + rocml_mock.amdsmi_get_power_info.return_value = {'power_limit': 300} + with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True): + assert manager.get_device_power_limit(0) == 300 + + +def test_amd_get_device_power_limit_non_numeric_returns_none(): + """power_limit='N/A' -> returns None.""" + manager = _make_amd_manager() + rocml_mock = mock.Mock() + rocml_mock.amdsmi_get_power_info.return_value = {'power_limit': 'N/A'} + with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True): + assert manager.get_device_power_limit(0) is None + + +def test_amd_get_device_power_limit_missing_key_returns_none(): + """Missing power_limit key -> returns None without raising.""" + manager = _make_amd_manager() + rocml_mock = mock.Mock() + rocml_mock.amdsmi_get_power_info.return_value = {} + with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True): + assert manager.get_device_power_limit(0) is None From 94a360018c461bfe9b07fdca7a7548810f5142f8 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Mon, 27 Apr 2026 22:04:25 +0000 Subject: [PATCH 3/4] Use strict > threshold to preserve original boundary behavior at 100000 --- superbench/common/utils/device_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superbench/common/utils/device_manager.py b/superbench/common/utils/device_manager.py index a9a47ab01..1a86d0150 100644 --- a/superbench/common/utils/device_manager.py +++ b/superbench/common/utils/device_manager.py @@ -434,7 +434,7 @@ def get_device_power_limit(self, idx): return None # amdsmi returns power_limit in microwatts (e.g. 750000000 for 750W) on some # ROCm versions and in watts on others. Detect µW by magnitude and convert. - if power_limit >= _AMDSMI_MICROWATTS_THRESHOLD: + if power_limit > _AMDSMI_MICROWATTS_THRESHOLD: power_limit = power_limit // _AMDSMI_MICROWATTS_PER_WATT return int(power_limit) except Exception as err: From fd8f3bff7f25abf375a7fd1e408c29779c0b0144 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Sun, 3 May 2026 05:01:12 +0000 Subject: [PATCH 4/4] Address review feedback for AMD power monitoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Source: - Extract _amdsmi_power_to_watts helper that applies the microwatts->watts heuristic uniformly. This makes get_device_power and get_device_power_limit symmetric: previously the limit was µW-corrected but the live power was not, so on amdsmi versions reporting power in µW the monitored gpu_power could be off from gpu_power_limit by a factor of 1e6. - Make AmdDeviceManager.__del__ defensive: swallow exceptions from amdsmi_shut_down so interpreter-shutdown / partial-import scenarios cannot raise (e.g., NameError when the module global has been torn down). This is the proper place for the fix that the test was previously working around via sys.modules injection. Tests: - Drop the sys.modules-level rocml injection (no longer needed once __del__ is safe). - Add test_amd_get_device_power_microwatts_converted to lock in the symmetric unit handling for get_device_power. - Add test_amd_device_manager_lifecycle exercising __init__/__del__ through normal construction and verifying __del__ tolerates a raising amdsmi_shut_down. --- superbench/common/utils/device_manager.py | 49 ++++++++++++--------- tests/common/test_device_manager.py | 52 +++++++++++++++++++---- 2 files changed, 72 insertions(+), 29 deletions(-) diff --git a/superbench/common/utils/device_manager.py b/superbench/common/utils/device_manager.py index 1a86d0150..a7163e092 100644 --- a/superbench/common/utils/device_manager.py +++ b/superbench/common/utils/device_manager.py @@ -16,13 +16,27 @@ elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics': import amdsmi as rocml -# amdsmi reports power in microwatts; convert to watts when the raw value -# is at or above this threshold (any plausible per-GPU watt value is well -# below 100000, while µW values for real cards are tens of millions). +# amdsmi reports power in microwatts on some ROCm versions and in watts on +# others. Any plausible per-GPU watt value is well below 100,000, while µW +# values for real cards are tens of millions, so we use a magnitude-based +# heuristic to detect µW and convert. _AMDSMI_MICROWATTS_PER_WATT = 1_000_000 _AMDSMI_MICROWATTS_THRESHOLD = 100_000 +def _amdsmi_power_to_watts(value): + """Convert an amdsmi power value to integer watts. + + Returns None if value is not a plausible numeric reading (e.g. 'N/A' or bool). + Applies the µW->W heuristic above so callers never have to guess units. + """ + if not isinstance(value, numbers.Real) or isinstance(value, bool): + return None + if value > _AMDSMI_MICROWATTS_THRESHOLD: + value = value // _AMDSMI_MICROWATTS_PER_WATT + return int(value) + + class DeviceManager: """Device management base module.""" def __init__(self): @@ -339,7 +353,14 @@ def __init__(self): def __del__(self): """Destructor.""" - rocml.amdsmi_shut_down() + # Be defensive at interpreter shutdown / partial-import time: the + # module-level ``rocml`` global may have been torn down, or may never + # have been imported (e.g., when this class is constructed via + # __new__ in tests). Swallow any error so GC never raises. + try: + rocml.amdsmi_shut_down() + except Exception: + pass def get_device_count(self): """Get the number of device. @@ -402,14 +423,9 @@ def get_device_power(self, idx): if key not in power_measure: logger.warning('amdsmi power_info missing expected key: {}'.format(key)) continue - power = power_measure[key] - if isinstance(power, numbers.Real) and not isinstance(power, bool): - try: - return int(power) - except (TypeError, ValueError) as conv_err: - logger.warning( - 'Failed to convert amdsmi {} value {!r} to int: {}'.format(key, power, conv_err) - ) + watts = _amdsmi_power_to_watts(power_measure[key]) + if watts is not None: + return watts return None except Exception as err: logger.warning('Get device power failed: {}'.format(str(err))) @@ -429,14 +445,7 @@ def get_device_power_limit(self, idx): if 'power_limit' not in power_measure: logger.warning('amdsmi power_info missing expected key: power_limit') return None - power_limit = power_measure['power_limit'] - if not isinstance(power_limit, numbers.Real) or isinstance(power_limit, bool): - return None - # amdsmi returns power_limit in microwatts (e.g. 750000000 for 750W) on some - # ROCm versions and in watts on others. Detect µW by magnitude and convert. - if power_limit > _AMDSMI_MICROWATTS_THRESHOLD: - power_limit = power_limit // _AMDSMI_MICROWATTS_PER_WATT - return int(power_limit) + return _amdsmi_power_to_watts(power_measure['power_limit']) except Exception as err: logger.warning('Get device power limit failed: {}'.format(str(err))) return None diff --git a/tests/common/test_device_manager.py b/tests/common/test_device_manager.py index b50d5cfb1..aad47b0d3 100644 --- a/tests/common/test_device_manager.py +++ b/tests/common/test_device_manager.py @@ -3,22 +3,14 @@ """Tests for nvidia_helper module.""" -import sys import numbers from unittest import mock from tests.helper import decorator from superbench.common.utils import device_manager as dm -# Ensure the real device_manager module is loaded (bypassing the LazyImport -# proxy that ``dm`` may be) and inject a placeholder ``rocml`` so that -# AmdDeviceManager.__del__ does not raise NameError during GC of test -# instances on non-ROCm hosts. -import superbench.common.utils.device_manager as _dm_real # noqa: E402, F401 - _DM_MODULE = 'superbench.common.utils.device_manager' -if not hasattr(sys.modules[_DM_MODULE], 'rocml'): - sys.modules[_DM_MODULE].rocml = mock.Mock() + @decorator.cuda_test @mock.patch('superbench.common.utils.process.run_command') @@ -119,6 +111,23 @@ def test_amd_get_device_power_missing_keys_returns_none(): assert manager.get_device_power(0) is None +def test_amd_get_device_power_microwatts_converted(): + """average_socket_power reported in µW -> converted to watts. + + Verifies the unit handling is symmetric with get_device_power_limit so the + monitor record's gpu_power and gpu_power_limit cannot drift by 1e6. + """ + manager = _make_amd_manager() + rocml_mock = mock.Mock() + rocml_mock.amdsmi_get_power_info.return_value = { + 'average_socket_power': 350_000_000, # 350 W in µW + 'current_socket_power': 360_000_000, + 'power_limit': 750_000_000, + } + with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True): + assert manager.get_device_power(0) == 350 + assert manager.get_device_power_limit(0) == 750 + def test_amd_get_device_power_limit_microwatts_converted(): """power_limit reported in µW (e.g., 750000000) -> converted to 750 W.""" manager = _make_amd_manager() @@ -153,3 +162,28 @@ def test_amd_get_device_power_limit_missing_key_returns_none(): rocml_mock.amdsmi_get_power_info.return_value = {} with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True): assert manager.get_device_power_limit(0) is None + + +def test_amd_device_manager_lifecycle(): + """__init__ calls amdsmi_init/get_processor_handles; __del__ tolerates failures. + + Lifecycle is important: a regression in __del__ would surface as noisy + NameError / AttributeError messages in benchmark logs at interpreter shutdown. + """ + rocml_mock = mock.Mock() + rocml_mock.amdsmi_get_processor_handles.return_value = ['h0', 'h1'] + with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True): + manager = dm.AmdDeviceManager() + rocml_mock.amdsmi_init.assert_called_once() + assert manager.get_device_count() == 2 + manager.__del__() + rocml_mock.amdsmi_shut_down.assert_called_once() + + # Simulate the destructor running when amdsmi has been torn down (e.g., + # interpreter shutdown). It must swallow the error rather than raise. + manager2 = dm.AmdDeviceManager.__new__(dm.AmdDeviceManager) + manager2._device_handlers = [] + bad_rocml = mock.Mock() + bad_rocml.amdsmi_shut_down.side_effect = RuntimeError('rocm gone') + with mock.patch(f'{_DM_MODULE}.rocml', bad_rocml, create=True): + manager2.__del__() # must not raise