diff --git a/superbench/common/utils/device_manager.py b/superbench/common/utils/device_manager.py
index 2a594fef0..a7163e092 100644
--- a/superbench/common/utils/device_manager.py
+++ b/superbench/common/utils/device_manager.py
@@ -3,6 +3,7 @@
 
 """Device Managerment Library Utility."""
 
+import numbers
 from typing import Optional
 
 from superbench.common.utils import logger
@@ -15,6 +16,38 @@
 elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics':
     import amdsmi as rocml
 
+# amdsmi reports power in microwatts on some ROCm versions and in watts on
+# others. Any plausible per-GPU watt value is well below 100,000, while µW
+# values for real cards are tens of millions, so we use a magnitude-based
+# heuristic to detect µW and convert.
+_AMDSMI_MICROWATTS_PER_WATT = 1_000_000
+_AMDSMI_MICROWATTS_THRESHOLD = 100_000
+
+
+def _amdsmi_power_to_watts(value):
+    """Convert an amdsmi power value to integer watts.
+
+    Applies the µW->W heuristic above so callers never have to guess units.
+
+    Args:
+        value: Raw power field from amdsmi, e.g. a number or the string 'N/A'.
+
+    Returns:
+        int or None: Whole watts, or None if value is not a usable numeric
+            reading (e.g. 'N/A', bool, NaN or infinity).
+    """
+    if not isinstance(value, numbers.Real) or isinstance(value, bool):
+        return None
+    if value > _AMDSMI_MICROWATTS_THRESHOLD:
+        value = value // _AMDSMI_MICROWATTS_PER_WATT
+    try:
+        return int(value)
+    except (ValueError, OverflowError):
+        # int() raises on NaN and infinity. Report those as "no reading" so
+        # get_device_power can still fall back to its next field instead of
+        # bailing out through its generic exception handler.
+        return None
+
 
 class DeviceManager:
     """Device management base module."""
@@ -332,7 +365,14 @@ def __init__(self):
 
     def __del__(self):
         """Destructor."""
-        rocml.amdsmi_shut_down()
+        # Be defensive at interpreter shutdown / partial-import time: the
+        # module-level ``rocml`` global may have been torn down, or may never
+        # have been imported (e.g., when this class is constructed via
+        # __new__ in tests). Swallow any error so GC never raises.
+        try:
+            rocml.amdsmi_shut_down()
+        except Exception:
+            pass
 
     def get_device_count(self):
         """Get the number of device.
@@ -389,10 +429,19 @@ def get_device_power(self, idx):
         """
         try:
             power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx])
+            # amdsmi sets fields to 'N/A' when the hardware reports 0xFFFF (unsupported).
+            # On MI300X, average_socket_power is unsupported, so fall back to current_socket_power.
+            for key in ('average_socket_power', 'current_socket_power'):
+                if key not in power_measure:
+                    logger.warning('amdsmi power_info missing expected key: {}'.format(key))
+                    continue
+                watts = _amdsmi_power_to_watts(power_measure[key])
+                if watts is not None:
+                    return watts
+            return None
         except Exception as err:
             logger.warning('Get device power failed: {}'.format(str(err)))
             return None
-        return int(power_measure['average_socket_power'])
 
     def get_device_power_limit(self, idx):
         """Get the power management limit of device, unit: watt.
@@ -405,10 +454,13 @@ def get_device_power_limit(self, idx):
         """
         try:
             power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx])
+            if 'power_limit' not in power_measure:
+                logger.warning('amdsmi power_info missing expected key: power_limit')
+                return None
+            return _amdsmi_power_to_watts(power_measure['power_limit'])
         except Exception as err:
             logger.warning('Get device power limit failed: {}'.format(str(err)))
             return None
-        return int(power_measure['power_limit'])
 
     def get_device_memory(self, idx):
         """Get the memory information of device, unit: byte.
diff --git a/tests/common/test_device_manager.py b/tests/common/test_device_manager.py
index d78d1bf1d..aad47b0d3 100644
--- a/tests/common/test_device_manager.py
+++ b/tests/common/test_device_manager.py
@@ -9,6 +9,8 @@
 from tests.helper import decorator
 from superbench.common.utils import device_manager as dm
 
+_DM_MODULE = 'superbench.common.utils.device_manager'
+
 
 @decorator.cuda_test
 @mock.patch('superbench.common.utils.process.run_command')
@@ -52,3 +54,137 @@ def test_nvidia_helper_utils(mock_run_command):
         'gpu_remap_none': 0
     }
     assert (gpu_remapped_info == expected)
+
+
+def _make_amd_manager():
+    """Build an AmdDeviceManager instance bypassing __init__ (no ROCm required)."""
+    manager = dm.AmdDeviceManager.__new__(dm.AmdDeviceManager)
+    manager._device_handlers = [mock.Mock()]
+    return manager
+
+
+def test_amd_get_device_power_average_supported():
+    """average_socket_power is numeric -> returned as int."""
+    manager = _make_amd_manager()
+    rocml_mock = mock.Mock()
+    rocml_mock.amdsmi_get_power_info.return_value = {
+        'average_socket_power': 123.7,
+        'current_socket_power': 456,
+        'power_limit': 750,
+    }
+    with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True):
+        assert manager.get_device_power(0) == 123
+
+
+def test_amd_get_device_power_falls_back_to_current():
+    """average_socket_power='N/A' -> fall back to current_socket_power."""
+    manager = _make_amd_manager()
+    rocml_mock = mock.Mock()
+    rocml_mock.amdsmi_get_power_info.return_value = {
+        'average_socket_power': 'N/A',
+        'current_socket_power': 321,
+        'power_limit': 750,
+    }
+    with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True):
+        assert manager.get_device_power(0) == 321
+
+
+def test_amd_get_device_power_both_unsupported_returns_none():
+    """Both fields non-numeric -> returns None."""
+    manager = _make_amd_manager()
+    rocml_mock = mock.Mock()
+    rocml_mock.amdsmi_get_power_info.return_value = {
+        'average_socket_power': 'N/A',
+        'current_socket_power': 'N/A',
+        'power_limit': 750,
+    }
+    with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True):
+        assert manager.get_device_power(0) is None
+
+
+def test_amd_get_device_power_missing_keys_returns_none():
+    """Missing keys -> None and warning logged (no exception)."""
+    manager = _make_amd_manager()
+    rocml_mock = mock.Mock()
+    rocml_mock.amdsmi_get_power_info.return_value = {}
+    with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True):
+        assert manager.get_device_power(0) is None
+
+
+def test_amd_get_device_power_microwatts_converted():
+    """average_socket_power reported in µW -> converted to watts.
+
+    Verifies the unit handling is symmetric with get_device_power_limit so the
+    monitor record's gpu_power and gpu_power_limit cannot drift by 1e6.
+    """
+    manager = _make_amd_manager()
+    rocml_mock = mock.Mock()
+    rocml_mock.amdsmi_get_power_info.return_value = {
+        'average_socket_power': 350_000_000,    # 350 W in µW
+        'current_socket_power': 360_000_000,
+        'power_limit': 750_000_000,
+    }
+    with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True):
+        assert manager.get_device_power(0) == 350
+        assert manager.get_device_power_limit(0) == 750
+
+
+def test_amd_get_device_power_limit_microwatts_converted():
+    """power_limit reported in µW (e.g., 750000000) -> converted to 750 W."""
+    manager = _make_amd_manager()
+    rocml_mock = mock.Mock()
+    rocml_mock.amdsmi_get_power_info.return_value = {'power_limit': 750_000_000}
+    with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True):
+        assert manager.get_device_power_limit(0) == 750
+
+
+def test_amd_get_device_power_limit_watts_passthrough():
+    """power_limit already in watts (small value) -> returned as-is."""
+    manager = _make_amd_manager()
+    rocml_mock = mock.Mock()
+    rocml_mock.amdsmi_get_power_info.return_value = {'power_limit': 300}
+    with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True):
+        assert manager.get_device_power_limit(0) == 300
+
+
+def test_amd_get_device_power_limit_non_numeric_returns_none():
+    """power_limit='N/A' -> returns None."""
+    manager = _make_amd_manager()
+    rocml_mock = mock.Mock()
+    rocml_mock.amdsmi_get_power_info.return_value = {'power_limit': 'N/A'}
+    with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True):
+        assert manager.get_device_power_limit(0) is None
+
+
+def test_amd_get_device_power_limit_missing_key_returns_none():
+    """Missing power_limit key -> returns None without raising."""
+    manager = _make_amd_manager()
+    rocml_mock = mock.Mock()
+    rocml_mock.amdsmi_get_power_info.return_value = {}
+    with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True):
+        assert manager.get_device_power_limit(0) is None
+
+
+def test_amd_device_manager_lifecycle():
+    """__init__ calls amdsmi_init/get_processor_handles; __del__ tolerates failures.
+
+    Lifecycle is important: a regression in __del__ would surface as noisy
+    NameError / AttributeError messages in benchmark logs at interpreter shutdown.
+    """
+    rocml_mock = mock.Mock()
+    rocml_mock.amdsmi_get_processor_handles.return_value = ['h0', 'h1']
+    with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True):
+        manager = dm.AmdDeviceManager()
+        rocml_mock.amdsmi_init.assert_called_once()
+        assert manager.get_device_count() == 2
+        manager.__del__()
+    rocml_mock.amdsmi_shut_down.assert_called_once()
+
+    # Simulate the destructor running when amdsmi has been torn down (e.g.,
+    # interpreter shutdown). It must swallow the error rather than raise.
+    manager2 = dm.AmdDeviceManager.__new__(dm.AmdDeviceManager)
+    manager2._device_handlers = []
+    bad_rocml = mock.Mock()
+    bad_rocml.amdsmi_shut_down.side_effect = RuntimeError('rocm gone')
+    with mock.patch(f'{_DM_MODULE}.rocml', bad_rocml, create=True):
+        manager2.__del__()    # must not raise