Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 43 additions & 3 deletions superbench/common/utils/device_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

"""Device Managerment Library Utility."""

import numbers
from typing import Optional

from superbench.common.utils import logger
Expand All @@ -15,6 +16,26 @@
elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics':
import amdsmi as rocml

# amdsmi reports power in microwatts on some ROCm versions and in watts on
# others. Any plausible per-GPU watt value is well below 100,000, while µW
# values for real cards are tens of millions, so we use a magnitude-based
# heuristic to detect µW and convert.
_AMDSMI_MICROWATTS_PER_WATT = 1_000_000
_AMDSMI_MICROWATTS_THRESHOLD = 100_000


def _amdsmi_power_to_watts(value):
    """Convert an amdsmi power value to integer watts.

    Values above ``_AMDSMI_MICROWATTS_THRESHOLD`` are treated as microwatts and
    floor-divided by ``_AMDSMI_MICROWATTS_PER_WATT``; smaller values are taken
    to be watts already. The result is truncated to ``int`` either way, so
    callers never have to guess units.

    Args:
        value: raw reading taken from an amdsmi power-info field.

    Return:
        watts (int): the reading in whole watts, or None if value is not a
            usable number (e.g. 'N/A', a bool, NaN or infinity).
    """
    # bool is a subclass of int, so it has to be rejected explicitly.
    if isinstance(value, bool) or not isinstance(value, numbers.Real):
        return None
    if value > _AMDSMI_MICROWATTS_THRESHOLD:
        value = value // _AMDSMI_MICROWATTS_PER_WATT
    try:
        return int(value)
    except (ValueError, OverflowError):
        # NaN and +/-inf are numbers.Real but have no integer value. Report
        # them as "no reading" so a caller can fall back to another field
        # instead of failing the whole query.
        return None


class DeviceManager:
"""Device management base module."""
Expand Down Expand Up @@ -332,7 +353,14 @@ def __init__(self):

def __del__(self):
    """Destructor: shut down amdsmi on a best-effort basis, never raising."""
    # Be defensive at interpreter shutdown / partial-import time: the
    # module-level ``rocml`` global may have been torn down, or may never
    # have been imported (e.g., when this class is constructed via
    # __new__ in tests). The shutdown call must therefore only ever run
    # inside the guard, and exactly once.
    try:
        rocml.amdsmi_shut_down()
    except Exception:
        # Deliberately swallowed so garbage collection never raises.
        pass

def get_device_count(self):
"""Get the number of device.
Expand Down Expand Up @@ -389,10 +417,19 @@ def get_device_power(self, idx):
"""
try:
power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx])
# amdsmi sets fields to 'N/A' when the hardware reports 0xFFFF (unsupported).
# On MI300X, average_socket_power is unsupported, so fall back to current_socket_power.
for key in ('average_socket_power', 'current_socket_power'):
if key not in power_measure:
logger.warning('amdsmi power_info missing expected key: {}'.format(key))
continue
watts = _amdsmi_power_to_watts(power_measure[key])
if watts is not None:
return watts
return None
except Exception as err:
logger.warning('Get device power failed: {}'.format(str(err)))
return None
return int(power_measure['average_socket_power'])

def get_device_power_limit(self, idx):
    """Get the power management limit of device, unit: watt.

    Args:
        idx (int): index into the device handler list.

    Return:
        limit (int): the power limit in whole watts, or None if the query
            raised, the 'power_limit' field is missing, or its value is
            not numeric (e.g. 'N/A').
    """
    try:
        power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx])
        if 'power_limit' not in power_measure:
            logger.warning('amdsmi power_info missing expected key: power_limit')
            return None
        # The helper normalizes units (µW vs W) and maps non-numeric
        # readings to None.
        return _amdsmi_power_to_watts(power_measure['power_limit'])
    except Exception as err:
        logger.warning('Get device power limit failed: {}'.format(str(err)))
        return None

def get_device_memory(self, idx):
"""Get the memory information of device, unit: byte.
Expand Down
135 changes: 135 additions & 0 deletions tests/common/test_device_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from tests.helper import decorator
from superbench.common.utils import device_manager as dm

# Dotted path of the module under test; used to build mock.patch targets
# such as f'{_DM_MODULE}.rocml'.
_DM_MODULE = 'superbench.common.utils.device_manager'

Comment on lines +12 to +13

@decorator.cuda_test
@mock.patch('superbench.common.utils.process.run_command')
Expand Down Expand Up @@ -52,3 +54,136 @@ def test_nvidia_helper_utils(mock_run_command):
'gpu_remap_none': 0
}
assert (gpu_remapped_info == expected)


def _make_amd_manager():
    """Create an AmdDeviceManager without running __init__ (no ROCm required).

    The returned instance carries a single mocked device handler.
    """
    manager_cls = dm.AmdDeviceManager
    instance = manager_cls.__new__(manager_cls)
    instance._device_handlers = [mock.Mock()]
    return instance


def test_amd_get_device_power_average_supported():
    """A numeric average_socket_power is truncated to int and returned."""
    power_info = {
        'average_socket_power': 123.7,
        'current_socket_power': 456,
        'power_limit': 750,
    }
    fake_rocml = mock.Mock()
    fake_rocml.amdsmi_get_power_info.return_value = power_info
    amd = _make_amd_manager()
    with mock.patch(f'{_DM_MODULE}.rocml', fake_rocml, create=True):
        reported = amd.get_device_power(0)
    assert reported == 123


def test_amd_get_device_power_falls_back_to_current():
    """When average_socket_power is 'N/A', current_socket_power is used instead."""
    power_info = {
        'average_socket_power': 'N/A',
        'current_socket_power': 321,
        'power_limit': 750,
    }
    fake_rocml = mock.Mock()
    fake_rocml.amdsmi_get_power_info.return_value = power_info
    amd = _make_amd_manager()
    with mock.patch(f'{_DM_MODULE}.rocml', fake_rocml, create=True):
        reported = amd.get_device_power(0)
    assert reported == 321


def test_amd_get_device_power_both_unsupported_returns_none():
    """If neither socket-power field is numeric, the result is None."""
    power_info = {
        'average_socket_power': 'N/A',
        'current_socket_power': 'N/A',
        'power_limit': 750,
    }
    fake_rocml = mock.Mock()
    fake_rocml.amdsmi_get_power_info.return_value = power_info
    amd = _make_amd_manager()
    with mock.patch(f'{_DM_MODULE}.rocml', fake_rocml, create=True):
        reported = amd.get_device_power(0)
    assert reported is None


def test_amd_get_device_power_missing_keys_returns_none():
    """Missing keys -> None and warning logged (no exception)."""
    manager = _make_amd_manager()
    rocml_mock = mock.Mock()
    rocml_mock.amdsmi_get_power_info.return_value = {}
    # Patch the module's logger as well, so the "warning logged" half of the
    # contract is actually checked rather than only stated in the docstring.
    with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True), \
            mock.patch(f'{_DM_MODULE}.logger') as logger_mock:
        assert manager.get_device_power(0) is None
    assert logger_mock.warning.called


def test_amd_get_device_power_microwatts_converted():
    """Socket power reported in µW comes back in watts.

    Power and power limit are checked together so that both go through the
    same unit handling; otherwise the monitor record's gpu_power and
    gpu_power_limit could differ by a factor of 1e6.
    """
    power_info = {
        'average_socket_power': 350_000_000,    # 350 W in µW
        'current_socket_power': 360_000_000,
        'power_limit': 750_000_000,
    }
    fake_rocml = mock.Mock()
    fake_rocml.amdsmi_get_power_info.return_value = power_info
    amd = _make_amd_manager()
    with mock.patch(f'{_DM_MODULE}.rocml', fake_rocml, create=True):
        reported_power = amd.get_device_power(0)
        assert reported_power == 350
        reported_limit = amd.get_device_power_limit(0)
        assert reported_limit == 750

def test_amd_get_device_power_limit_microwatts_converted():
    """A power_limit given in µW (750000000) is reported as 750 W."""
    fake_rocml = mock.Mock()
    fake_rocml.amdsmi_get_power_info.return_value = {'power_limit': 750_000_000}
    amd = _make_amd_manager()
    with mock.patch(f'{_DM_MODULE}.rocml', fake_rocml, create=True):
        reported = amd.get_device_power_limit(0)
    assert reported == 750


def test_amd_get_device_power_limit_watts_passthrough():
    """A small power_limit is taken to be watts already and returned unchanged."""
    fake_rocml = mock.Mock()
    fake_rocml.amdsmi_get_power_info.return_value = {'power_limit': 300}
    amd = _make_amd_manager()
    with mock.patch(f'{_DM_MODULE}.rocml', fake_rocml, create=True):
        reported = amd.get_device_power_limit(0)
    assert reported == 300


def test_amd_get_device_power_limit_non_numeric_returns_none():
    """A power_limit of 'N/A' yields None."""
    fake_rocml = mock.Mock()
    fake_rocml.amdsmi_get_power_info.return_value = {'power_limit': 'N/A'}
    amd = _make_amd_manager()
    with mock.patch(f'{_DM_MODULE}.rocml', fake_rocml, create=True):
        reported = amd.get_device_power_limit(0)
    assert reported is None


def test_amd_get_device_power_limit_missing_key_returns_none():
    """An empty power-info dict (no power_limit key) yields None, not an exception."""
    fake_rocml = mock.Mock()
    fake_rocml.amdsmi_get_power_info.return_value = {}
    amd = _make_amd_manager()
    with mock.patch(f'{_DM_MODULE}.rocml', fake_rocml, create=True):
        reported = amd.get_device_power_limit(0)
    assert reported is None


def test_amd_device_manager_lifecycle():
    """Check construction and destruction of AmdDeviceManager against a mocked amdsmi.

    Constructing the manager must call amdsmi_init once and expose the mocked
    processor handles through get_device_count; __del__ must call
    amdsmi_shut_down once and must tolerate that call failing.

    This matters because a regression in __del__ would show up as noisy
    NameError / AttributeError messages in benchmark logs at interpreter
    shutdown.
    """
    healthy_rocml = mock.Mock()
    healthy_rocml.amdsmi_get_processor_handles.return_value = ['h0', 'h1']
    with mock.patch(f'{_DM_MODULE}.rocml', healthy_rocml, create=True):
        live = dm.AmdDeviceManager()
        healthy_rocml.amdsmi_init.assert_called_once()
        assert live.get_device_count() == 2
        live.__del__()
        healthy_rocml.amdsmi_shut_down.assert_called_once()

    # Second scenario: the destructor runs after amdsmi has been torn down
    # (e.g., interpreter shutdown). The error has to be swallowed, not raised.
    orphan = dm.AmdDeviceManager.__new__(dm.AmdDeviceManager)
    orphan._device_handlers = []
    broken_rocml = mock.Mock()
    broken_rocml.amdsmi_shut_down.side_effect = RuntimeError('rocm gone')
    with mock.patch(f'{_DM_MODULE}.rocml', broken_rocml, create=True):
        orphan.__del__()    # must not raise
Loading