From fdcc55d82fb25641b9039b5e22ea058f5d0be6bc Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 9 Apr 2026 05:07:26 +0000 Subject: [PATCH 01/20] add ROCm 6.4 support --- dockerfile/rocm6.2.x.dockerfile | 2 +- dockerfile/rocm6.4.x.dockerfile | 214 ++++++++++++++++++ .../gpu_stream/CMakeLists.txt | 72 +++++- 3 files changed, 281 insertions(+), 7 deletions(-) create mode 100644 dockerfile/rocm6.4.x.dockerfile diff --git a/dockerfile/rocm6.2.x.dockerfile b/dockerfile/rocm6.2.x.dockerfile index 0cd86c667..6b37afb77 100644 --- a/dockerfile/rocm6.2.x.dockerfile +++ b/dockerfile/rocm6.2.x.dockerfile @@ -130,7 +130,7 @@ RUN cd /tmp && \ # Install Intel MLC RUN cd /tmp && \ - wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \ + wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \ tar xzf mlc.tgz Linux/mlc && \ cp ./Linux/mlc /usr/local/bin/ && \ rm -rf ./Linux mlc.tgz diff --git a/dockerfile/rocm6.4.x.dockerfile b/dockerfile/rocm6.4.x.dockerfile new file mode 100644 index 000000000..a0c3e3025 --- /dev/null +++ b/dockerfile/rocm6.4.x.dockerfile @@ -0,0 +1,214 @@ +ARG BASE_IMAGE=rocm/pytorch:rocm6.4.4_ubuntu24.04_py3.12_pytorch_release_2.7.1 + +FROM ${BASE_IMAGE} + +# OS: +# - Ubuntu: 24.04 +# - Docker Client: 20.10.8 +# ROCm: +# - ROCm: 6.4 +# Lib: +# - torch: 2.7.1 +# - rccl: release/rocm-rel-6.4 +# - hipblaslt: release-staging/rocm-rel-6.4 +# - rocblas: release-staging/rocm-rel-6.4 +# - openmpi: 4.1.x +# Intel: +# - mlc: v3.12 + +LABEL maintainer="SuperBench" + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && \ + apt-get -q install -y --no-install-recommends \ + autoconf \ + automake \ + bc \ + build-essential \ + curl \ + dmidecode \ + git \ + hipify-clang \ + iproute2 \ + jq \ + libaio-dev \ + libboost-program-options-dev \ + libcap2 \ + libcurl4-openssl-dev \ + libnuma-dev \ + libpci-dev \ + libssl-dev \ + libtinfo6 \ + libtool \ + lshw \ + net-tools \ + numactl \ + openssh-client \ + openssh-server \ + 
pciutils \ + python3-mpi4py \ + rsync \ + sudo \ + util-linux \ + vim \ + wget \ + && \ + rm -rf /tmp/* + +ARG NUM_MAKE_JOBS=64 + +# Check if CMake is installed and its version +RUN cmake_version=$(cmake --version 2>/dev/null | grep -oP "(?<=cmake version )(\d+\.\d+)" || echo "0.0") && \ + required_version="3.24.1" && \ + if [ "$(printf "%s\n" "$required_version" "$cmake_version" | sort -V | head -n 1)" != "$required_version" ]; then \ + echo "existing cmake version is ${cmake_version}" && \ + cd /tmp && \ + wget -q https://github.com/Kitware/CMake/releases/download/v${required_version}/cmake-${required_version}.tar.gz && \ + tar xzf cmake-${required_version}.tar.gz && \ + cd cmake-${required_version} && \ + ./bootstrap --prefix=/usr --no-system-curl --parallel=16 && \ + make -j ${NUM_MAKE_JOBS} && \ + make install && \ + rm -rf /tmp/cmake-${required_version}* \ + else \ + echo "CMake version is greater than or equal to 3.24.1"; \ + fi + +# Install Docker +ENV DOCKER_VERSION=20.10.8 +RUN cd /tmp && \ + wget -q https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \ + tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \ + rm docker.tgz + +# Update system config +RUN mkdir -p /root/.ssh && \ + touch /root/.ssh/authorized_keys && \ + mkdir -p /var/run/sshd && \ + sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \ + sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \ + sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \ + echo "* soft nofile 1048576\n* hard nofile 1048576" >> /etc/security/limits.conf && \ + echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf + + +# Set Ubuntu version +ENV UBUNTU_VERSION=24.04 + +# Install OFED +ENV OFED_VERSION=24.10-1.1.4.0 +# Check if ofed_info is present and has a version +RUN if ! 
command -v ofed_info >/dev/null 2>&1; then \ + echo "OFED not found. Installing OFED..."; \ + cd /tmp && \ + wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \ + tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \ + PATH=/usr/bin:${PATH} MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \ + rm -rf MLNX_OFED_LINUX-${OFED_VERSION}* ; \ + fi + +ENV ROCM_PATH=/opt/rocm + +# Install OpenMPI +ENV OPENMPI_VERSION=4.1.x +ENV MPI_HOME=/usr/local/mpi +# Check if Open MPI is installed +RUN cd /tmp && \ + git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \ + cd ompi && \ + ./autogen.pl && \ + mkdir build && \ + cd build && \ + ../configure --prefix=/usr/local/mpi --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \ + make -j $(nproc) && \ + make -j $(nproc) install && \ + ldconfig && \ + cd / && \ + rm -rf /tmp/openmpi-${OPENMPI_VERSION}* + +# Install Intel MLC +RUN cd /tmp && \ + wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \ + tar xzf mlc.tgz Linux/mlc && \ + cp ./Linux/mlc /usr/local/bin/ && \ + rm -rf ./Linux mlc.tgz + +# Install RCCL +RUN cd /opt/ && \ + git clone -b release/rocm-rel-6.4 https://github.com/ROCmSoftwarePlatform/rccl.git && \ + cd rccl && \ + mkdir build && \ + cd build && \ + CXX=/opt/rocm/bin/hipcc cmake -DHIP_COMPILER=clang -DCMAKE_BUILD_TYPE=Release -DCMAKE_VERBOSE_MAKEFILE=1 \ + -DCMAKE_PREFIX_PATH="${ROCM_PATH}/hsa;${ROCM_PATH}/hip;${ROCM_PATH}/share/rocm/cmake/;${ROCM_PATH}" \ + .. && \ + make -j${NUM_MAKE_JOBS} + +# Install AMD SMI Python Library +RUN apt install amd-smi-lib -y && \ + cd /opt/rocm/share/amd_smi && \ + python3 -m pip install . 
+ +ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ + LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \ + LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \ + SB_HOME=/opt/superbench \ + SB_MICRO_PATH=/opt/superbench \ + ANSIBLE_DEPRECATION_WARNINGS=FALSE \ + ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections + +RUN echo PATH="$PATH" > /etc/environment && \ + echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \ + echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment + +RUN apt install rocm-cmake -y && \ + python3 -m pip install --upgrade pip wheel "setuptools>=69.0" + +WORKDIR ${SB_HOME} + +ADD third_party third_party +# perftest_rocm6.patch changes are already upstream in the submodule version +# Build everything except hipblaslt and apex first (apex needs special handling for Python 3.12) +RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release-staging/rocm-rel-6.4 HIPBLASLT_BRANCH=release-staging/rocm-rel-6.4 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_hipblaslt -o apex_rocm -o rocm_megatron_lm +# Build hipblaslt separately with Tensile target-triple fix for ROCm 6.4 clang +RUN cd third_party && \ + git clone -b release-staging/rocm-rel-6.4 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ + sed -i 's/host-x86_64-unknown-linux,/host-x86_64-unknown-linux-gnu,/' \ + hipBLASLt/tensilelite/Tensile/BuildCommands/SharedCommands.py && \ + cd hipBLASLt && ./install.sh -dc && \ + cp -v build/release/clients/staging/hipblaslt-bench /opt/superbench/bin/ +RUN cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/lib/* /opt/rocm/lib/ && \ + cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/include/* /opt/rocm/include/ +RUN cd third_party/Megatron/Megatron-DeepSpeed && \ + git apply ../megatron_deepspeed_rocm6.patch + +# Skip 
apex for ROCm - it has a fatal double-free bug on Python 3.12 +# (crashes both during install and import). PyTorch 2.7.1 natively +# provides all apex functionality (AMP, fused optimizers, etc.). + +# Install TransformerEngine - pin to 386bd316 (before NVFP4/hip_fp4.h which needs ROCm 7.0+). +# Disable CK fused attention (aiter submodule has gfx950-only code); aotriton stays enabled. +# setup.py crashes with double-free during process exit (static destruction order fiasco +# between torch BuildExtension's HIP runtime and setuptools). Install is fully complete +# before the crash (all .so/.pyc/egg-info installed). || true handles the exit code. +RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git && \ + cd TransformerEngine && \ + git checkout 386bd316 && \ + git submodule update --init --recursive && \ + NVTE_FRAMEWORK=pytorch \ + NVTE_FUSED_ATTN_CK=0 \ + NVTE_ROCM_ARCH=gfx942 \ + python3 setup.py install || true +# Work around HIP static destruction order double-free at process exit. +# jemalloc gracefully handles double-free instead of aborting. +RUN apt-get install -y --no-install-recommends libjemalloc2 +ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2:$LD_PRELOAD +# Verify TE import works cleanly with jemalloc +RUN python3 -c "import transformer_engine.pytorch; print('TE installed successfully')" + +ADD . . 
+ENV USE_HIP_DATATYPE=1 +ENV USE_HIPBLAS_COMPUTETYPE=1 +RUN python3 -m pip install .[amdworker] && \ + CXX=/opt/rocm/bin/hipcc make cppbuild && \ + make postinstall diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt index 2c856f32a..8c374e7dc 100644 --- a/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt +++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt @@ -31,18 +31,78 @@ if(CUDAToolkit_FOUND) target_include_directories(gpu_stream PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) target_link_libraries(gpu_stream numa nvidia-ml) else() - # TODO: test for ROC # ROCm environment include(../rocm_common.cmake) find_package(hip QUIET) if(hip_FOUND) message(STATUS "Found ROCm: " ${HIP_VERSION}) + enable_language(HIP) - # Convert cuda code to hip code in cpp - execute_process(COMMAND hipify-perl -print-stats -o gpu_stream.cpp ${SOURCES} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/) + # Hipify each source and header file individually into the build directory + set(HIPIFY_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/hipified) + file(MAKE_DIRECTORY ${HIPIFY_OUTPUT_DIR}) + file(GLOB ALL_GPU_STREAM_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.cu + ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.h + ) + foreach(SRC_FILE ${ALL_GPU_STREAM_SRCS}) + get_filename_component(SRC_NAME ${SRC_FILE} NAME) + # hipify-perl converts CUDA API to HIP API + execute_process( + COMMAND hipify-perl -o ${HIPIFY_OUTPUT_DIR}/${SRC_NAME} ${SRC_FILE} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + endforeach() + # Remove nvml.h include (NVIDIA-only) from hipified files + # and replace with rocm_smi for memory clock queries + execute_process( + COMMAND sed -i + "s|#include |#include |g" + ${HIPIFY_OUTPUT_DIR}/gpu_stream_utils.hpp + ) + execute_process( + COMMAND sed -i + "s|#include |#include |g" + ${HIPIFY_OUTPUT_DIR}/gpu_stream.cu + ) + # 
Replace the NVML-based GetActualMemoryClockRate with rocm_smi equivalent + file(WRITE ${HIPIFY_OUTPUT_DIR}/rocm_mem_clock.inc [=[ + // ROCm SMI implementation + rsmi_status_t ret; + ret = rsmi_init(0); + if (ret != RSMI_STATUS_SUCCESS) { + std::cerr << "Failed to initialize ROCm SMI" << std::endl; + return -1.0f; + } + rsmi_frequencies_t freq; + ret = rsmi_dev_gpu_clk_freq_get(gpu_id, RSMI_CLK_TYPE_MEM, &freq); + if (ret != RSMI_STATUS_SUCCESS) { + std::cerr << "Failed to get memory clock" << std::endl; + rsmi_shut_down(); + return -1.0f; + } + float clock_mhz = static_cast(freq.frequency[freq.current] / 1000000); + rsmi_shut_down(); + return clock_mhz; +]=]) + execute_process( + COMMAND sed -i "/nvmlReturn_t/,/return static_cast(clock_mhz);/{ + /nvmlReturn_t/r ${HIPIFY_OUTPUT_DIR}/rocm_mem_clock.inc + d + }" ${HIPIFY_OUTPUT_DIR}/gpu_stream.cu + ) - # link hip device lib - add_executable(gpu_stream gpu_stream.cpp) + set(HIP_SOURCES + ${HIPIFY_OUTPUT_DIR}/gpu_stream_test.cpp + ${HIPIFY_OUTPUT_DIR}/gpu_stream_utils.cpp + ${HIPIFY_OUTPUT_DIR}/gpu_stream.cu + ${HIPIFY_OUTPUT_DIR}/gpu_stream_kernels.cu + ) + set_source_files_properties(${HIP_SOURCES} PROPERTIES LANGUAGE HIP) + add_executable(gpu_stream ${HIP_SOURCES}) + target_include_directories(gpu_stream PRIVATE ${HIPIFY_OUTPUT_DIR}) include(CheckSymbolExists) check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY) @@ -51,7 +111,7 @@ else() endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") - target_link_libraries(gpu_stream numa hip::device) + target_link_libraries(gpu_stream numa hip::device rocm_smi64) else() message(FATAL_ERROR "No CUDA or ROCm environment found.") endif() From 77bae81116dbcbd320bd1aeea67e9b6170b47f2f Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Fri, 10 Apr 2026 00:26:32 +0000 Subject: [PATCH 02/20] fix additional stuff for rocm 6.4 --- dockerfile/rocm6.4.x.dockerfile | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff 
--git a/dockerfile/rocm6.4.x.dockerfile b/dockerfile/rocm6.4.x.dockerfile index a0c3e3025..676bab136 100644 --- a/dockerfile/rocm6.4.x.dockerfile +++ b/dockerfile/rocm6.4.x.dockerfile @@ -151,7 +151,7 @@ RUN apt install amd-smi-lib -y && \ ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \ - LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \ + LD_LIBRARY_PATH="/usr/local/mpi/lib:/opt/rocm/lib:/usr/local/lib/:${LD_LIBRARY_PATH}" \ SB_HOME=/opt/superbench \ SB_MICRO_PATH=/opt/superbench \ ANSIBLE_DEPRECATION_WARNINGS=FALSE \ @@ -164,12 +164,18 @@ RUN echo PATH="$PATH" > /etc/environment && \ RUN apt install rocm-cmake -y && \ python3 -m pip install --upgrade pip wheel "setuptools>=69.0" +# Install jemalloc early - prevents glibc double-free aborts during HIP +# static object teardown at process exit (affects setup.py builds that +# import torch's BuildExtension, which initializes the HIP runtime). 
+RUN apt-get install -y --no-install-recommends libjemalloc2 +ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2:$LD_PRELOAD + WORKDIR ${SB_HOME} ADD third_party third_party # perftest_rocm6.patch changes are already upstream in the submodule version -# Build everything except hipblaslt and apex first (apex needs special handling for Python 3.12) -RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release-staging/rocm-rel-6.4 HIPBLASLT_BRANCH=release-staging/rocm-rel-6.4 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_hipblaslt -o apex_rocm -o rocm_megatron_lm +# rocm_megatron_lm is broken upstream (pretrain_deepseek.py doesn't exist in rocm_dev branch) +RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release-staging/rocm-rel-6.4 HIPBLASLT_BRANCH=release-staging/rocm-rel-6.4 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_hipblaslt -o rocm_megatron_lm # Build hipblaslt separately with Tensile target-triple fix for ROCm 6.4 clang RUN cd third_party && \ git clone -b release-staging/rocm-rel-6.4 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ @@ -182,15 +188,9 @@ RUN cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/ RUN cd third_party/Megatron/Megatron-DeepSpeed && \ git apply ../megatron_deepspeed_rocm6.patch -# Skip apex for ROCm - it has a fatal double-free bug on Python 3.12 -# (crashes both during install and import). PyTorch 2.7.1 natively -# provides all apex functionality (AMP, fused optimizers, etc.). - # Install TransformerEngine - pin to 386bd316 (before NVFP4/hip_fp4.h which needs ROCm 7.0+). # Disable CK fused attention (aiter submodule has gfx950-only code); aotriton stays enabled. -# setup.py crashes with double-free during process exit (static destruction order fiasco -# between torch BuildExtension's HIP runtime and setuptools). Install is fully complete -# before the crash (all .so/.pyc/egg-info installed). 
|| true handles the exit code. +# --no-build-isolation so torch (with IS_HIP_EXTENSION=True) is visible during build. RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git && \ cd TransformerEngine && \ git checkout 386bd316 && \ @@ -198,12 +198,7 @@ RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git && \ NVTE_FRAMEWORK=pytorch \ NVTE_FUSED_ATTN_CK=0 \ NVTE_ROCM_ARCH=gfx942 \ - python3 setup.py install || true -# Work around HIP static destruction order double-free at process exit. -# jemalloc gracefully handles double-free instead of aborting. -RUN apt-get install -y --no-install-recommends libjemalloc2 -ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2:$LD_PRELOAD -# Verify TE import works cleanly with jemalloc + pip install --no-build-isolation . RUN python3 -c "import transformer_engine.pytorch; print('TE installed successfully')" ADD . . From 6d6a426ccdcc362b08c2f802ae2348829869f899 Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Sun, 12 Apr 2026 09:16:12 +0000 Subject: [PATCH 03/20] fix additional stuff for rocm 6.4 --- dockerfile/rocm6.4.x.dockerfile | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/dockerfile/rocm6.4.x.dockerfile b/dockerfile/rocm6.4.x.dockerfile index 676bab136..fd62ffe17 100644 --- a/dockerfile/rocm6.4.x.dockerfile +++ b/dockerfile/rocm6.4.x.dockerfile @@ -150,8 +150,7 @@ RUN apt install amd-smi-lib -y && \ python3 -m pip install . 
ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ - LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \ - LD_LIBRARY_PATH="/usr/local/mpi/lib:/opt/rocm/lib:/usr/local/lib/:${LD_LIBRARY_PATH}" \ + LD_LIBRARY_PATH="/opt/rccl/build:/usr/local/mpi/lib:/opt/rocm/lib:/usr/local/lib/:${LD_LIBRARY_PATH}" \ SB_HOME=/opt/superbench \ SB_MICRO_PATH=/opt/superbench \ ANSIBLE_DEPRECATION_WARNINGS=FALSE \ @@ -164,24 +163,28 @@ RUN echo PATH="$PATH" > /etc/environment && \ RUN apt install rocm-cmake -y && \ python3 -m pip install --upgrade pip wheel "setuptools>=69.0" -# Install jemalloc early - prevents glibc double-free aborts during HIP -# static object teardown at process exit (affects setup.py builds that -# import torch's BuildExtension, which initializes the HIP runtime). -RUN apt-get install -y --no-install-recommends libjemalloc2 -ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2:$LD_PRELOAD - WORKDIR ${SB_HOME} ADD third_party third_party # perftest_rocm6.patch changes are already upstream in the submodule version -# rocm_megatron_lm is broken upstream (pretrain_deepseek.py doesn't exist in rocm_dev branch) -RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release-staging/rocm-rel-6.4 HIPBLASLT_BRANCH=release-staging/rocm-rel-6.4 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_hipblaslt -o rocm_megatron_lm +# rocm_megatron_lm: broken upstream (pretrain_deepseek.py missing in rocm_dev branch) +# apex_rocm: skipped — all apex imports in Megatron-DeepSpeed are guarded with try/except, +# superbench has zero direct apex usage, and PyTorch 2.7 has native fused optimizers/AMP. 
+RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release-staging/rocm-rel-6.4 HIPBLASLT_BRANCH=release-staging/rocm-rel-6.4 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_hipblaslt -o rocm_megatron_lm -o apex_rocm # Build hipblaslt separately with Tensile target-triple fix for ROCm 6.4 clang +# Also fix joblib race condition (github.com/joblib/joblib/issues/1788) in Python 3.12: +# joblib's _retrieve() iterates _jobs_set while callbacks modify it. +# Fix: copy the set before iterating. Patch all joblib instances system-wide. +RUN pip install "joblib>=1.4.2" && \ + find / -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \ + 's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} + RUN cd third_party && \ git clone -b release-staging/rocm-rel-6.4 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ sed -i 's/host-x86_64-unknown-linux,/host-x86_64-unknown-linux-gnu,/' \ hipBLASLt/tensilelite/Tensile/BuildCommands/SharedCommands.py && \ cd hipBLASLt && ./install.sh -dc && \ + find /opt -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \ + 's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} + && \ cp -v build/release/clients/staging/hipblaslt-bench /opt/superbench/bin/ RUN cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/lib/* /opt/rocm/lib/ && \ cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/include/* /opt/rocm/include/ @@ -190,7 +193,6 @@ RUN cd third_party/Megatron/Megatron-DeepSpeed && \ # Install TransformerEngine - pin to 386bd316 (before NVFP4/hip_fp4.h which needs ROCm 7.0+). # Disable CK fused attention (aiter submodule has gfx950-only code); aotriton stays enabled. -# --no-build-isolation so torch (with IS_HIP_EXTENSION=True) is visible during build. 
RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git && \ cd TransformerEngine && \ git checkout 386bd316 && \ @@ -198,7 +200,7 @@ RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git && \ NVTE_FRAMEWORK=pytorch \ NVTE_FUSED_ATTN_CK=0 \ NVTE_ROCM_ARCH=gfx942 \ - pip install --no-build-isolation . + python3 setup.py install RUN python3 -c "import transformer_engine.pytorch; print('TE installed successfully')" ADD . . From 0fdd70355ae99cc91be78ae8ad0f72f230ff7037 Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Mon, 13 Apr 2026 01:31:19 +0000 Subject: [PATCH 04/20] initial rocm 7.0 and 7.2 --- dockerfile/rocm7.0.x.dockerfile | 209 +++++++++++++++++++++++++++++++ dockerfile/rocm7.2.x.dockerfile | 211 ++++++++++++++++++++++++++++++++ 2 files changed, 420 insertions(+) create mode 100644 dockerfile/rocm7.0.x.dockerfile create mode 100644 dockerfile/rocm7.2.x.dockerfile diff --git a/dockerfile/rocm7.0.x.dockerfile b/dockerfile/rocm7.0.x.dockerfile new file mode 100644 index 000000000..9d7bd3377 --- /dev/null +++ b/dockerfile/rocm7.0.x.dockerfile @@ -0,0 +1,209 @@ +ARG BASE_IMAGE=rocm/pytorch:rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.9.1 + +FROM ${BASE_IMAGE} + +# OS: +# - Ubuntu: 24.04 +# - Docker Client: 20.10.8 +# ROCm: +# - ROCm: 7.0 +# Lib: +# - torch: 2.9.1 +# - rccl: release/rocm-rel-7.0 +# - hipblaslt: release-staging/rocm-rel-7.0 +# - rocblas: release-staging/rocm-rel-7.0 +# - openmpi: 4.1.x +# Intel: +# - mlc: v3.12 + +LABEL maintainer="SuperBench" + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && \ + apt-get -q install -y --no-install-recommends \ + autoconf \ + automake \ + bc \ + build-essential \ + curl \ + dmidecode \ + git \ + hipify-clang \ + iproute2 \ + jq \ + libaio-dev \ + libboost-program-options-dev \ + libcap2 \ + libcurl4-openssl-dev \ + libnuma-dev \ + libpci-dev \ + libssl-dev \ + libtinfo6 \ + libtool \ + lshw \ + net-tools \ + numactl \ + openssh-client \ + openssh-server \ + pciutils \ + 
python3-mpi4py \ + rsync \ + sudo \ + util-linux \ + vim \ + wget \ + && \ + rm -rf /tmp/* + +ARG NUM_MAKE_JOBS=64 + +# Check if CMake is installed and its version +RUN cmake_version=$(cmake --version 2>/dev/null | grep -oP "(?<=cmake version )(\d+\.\d+)" || echo "0.0") && \ + required_version="3.24.1" && \ + if [ "$(printf "%s\n" "$required_version" "$cmake_version" | sort -V | head -n 1)" != "$required_version" ]; then \ + echo "existing cmake version is ${cmake_version}" && \ + cd /tmp && \ + wget -q https://github.com/Kitware/CMake/releases/download/v${required_version}/cmake-${required_version}.tar.gz && \ + tar xzf cmake-${required_version}.tar.gz && \ + cd cmake-${required_version} && \ + ./bootstrap --prefix=/usr --no-system-curl --parallel=16 && \ + make -j ${NUM_MAKE_JOBS} && \ + make install && \ + rm -rf /tmp/cmake-${required_version}* \ + else \ + echo "CMake version is greater than or equal to 3.24.1"; \ + fi + +# Install Docker +ENV DOCKER_VERSION=20.10.8 +RUN cd /tmp && \ + wget -q https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \ + tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \ + rm docker.tgz + +# Update system config +RUN mkdir -p /root/.ssh && \ + touch /root/.ssh/authorized_keys && \ + mkdir -p /var/run/sshd && \ + sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \ + sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \ + sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \ + echo "* soft nofile 1048576\n* hard nofile 1048576" >> /etc/security/limits.conf && \ + echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf + + +# Set Ubuntu version +ENV UBUNTU_VERSION=24.04 + +# Install OFED +ENV OFED_VERSION=24.10-1.1.4.0 +# Check if ofed_info is present and has a version +RUN if ! 
command -v ofed_info >/dev/null 2>&1; then \ + echo "OFED not found. Installing OFED..."; \ + cd /tmp && \ + wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \ + tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \ + PATH=/usr/bin:${PATH} MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \ + rm -rf MLNX_OFED_LINUX-${OFED_VERSION}* ; \ + fi + +ENV ROCM_PATH=/opt/rocm + +# Install OpenMPI +ENV OPENMPI_VERSION=4.1.x +ENV MPI_HOME=/usr/local/mpi +RUN cd /tmp && \ + git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \ + cd ompi && \ + ./autogen.pl && \ + mkdir build && \ + cd build && \ + ../configure --prefix=/usr/local/mpi --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \ + make -j $(nproc) && \ + make -j $(nproc) install && \ + ldconfig && \ + cd / && \ + rm -rf /tmp/openmpi-${OPENMPI_VERSION}* + +# Install Intel MLC +RUN cd /tmp && \ + wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \ + tar xzf mlc.tgz Linux/mlc && \ + cp ./Linux/mlc /usr/local/bin/ && \ + rm -rf ./Linux mlc.tgz + +# Install RCCL +# Set CMAKE_POLICY_VERSION_MINIMUM globally so all subprojects (mscclpp, etc.) +# work with CMake 4.0+ which dropped compat for cmake_minimum_required < 3.5 +ENV CMAKE_POLICY_VERSION_MINIMUM=3.5 +RUN cd /opt/ && \ + git clone -b release/rocm-rel-7.0 https://github.com/ROCmSoftwarePlatform/rccl.git && \ + cd rccl && \ + mkdir build && \ + cd build && \ + CXX=/opt/rocm/bin/hipcc cmake -DHIP_COMPILER=clang -DCMAKE_BUILD_TYPE=Release -DCMAKE_VERBOSE_MAKEFILE=1 \ + -DCMAKE_PREFIX_PATH="${ROCM_PATH}/hsa;${ROCM_PATH}/hip;${ROCM_PATH}/share/rocm/cmake/;${ROCM_PATH}" \ + .. 
&& \ + make -j${NUM_MAKE_JOBS} + +# Install AMD SMI Python Library +RUN apt install amd-smi-lib -y && \ + cd /opt/rocm/share/amd_smi && \ + python3 -m pip install . + +# Note: Do NOT LD_PRELOAD librccl.so — it causes segfaults on process exit +# due to HIP static object teardown order. Use LD_LIBRARY_PATH instead. +ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ + LD_LIBRARY_PATH="/opt/rccl/build:/usr/local/mpi/lib:/opt/rocm/lib:/usr/local/lib/:${LD_LIBRARY_PATH}" \ + SB_HOME=/opt/superbench \ + SB_MICRO_PATH=/opt/superbench \ + ANSIBLE_DEPRECATION_WARNINGS=FALSE \ + ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections + +RUN echo PATH="$PATH" > /etc/environment && \ + echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \ + echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment + +RUN apt install rocm-cmake -y && \ + python3 -m pip install --upgrade pip wheel "setuptools>=69.0" + +WORKDIR ${SB_HOME} + +ADD third_party third_party +# perftest_rocm6.patch changes are already upstream in the submodule version +# rocm_megatron_lm: broken upstream (pretrain_deepseek.py missing in rocm_dev branch) +# apex_rocm: skipped — all imports guarded, PyTorch 2.9 has native fused optimizers/AMP. 
+RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release-staging/rocm-rel-7.0 HIPBLASLT_BRANCH=release-staging/rocm-rel-7.0 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_hipblaslt -o rocm_megatron_lm -o apex_rocm +# Build hipblaslt separately with Tensile target-triple fix for clang +# Fix joblib race condition (github.com/joblib/joblib/issues/1788) for Python 3.12 +RUN pip install "joblib>=1.4.2" && \ + find / -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \ + 's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} + +RUN cd third_party && \ + git clone -b release-staging/rocm-rel-7.0 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ + sed -i 's/host-x86_64-unknown-linux,/host-x86_64-unknown-linux-gnu,/' \ + hipBLASLt/tensilelite/Tensile/BuildCommands/SharedCommands.py && \ + cd hipBLASLt && ./install.sh -dc && \ + find /opt -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \ + 's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} + && \ + cp -v build/release/clients/staging/hipblaslt-bench /opt/superbench/bin/ +RUN cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/lib/* /opt/rocm/lib/ && \ + cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/include/* /opt/rocm/include/ +RUN cd third_party/Megatron/Megatron-DeepSpeed && \ + git apply ../megatron_deepspeed_rocm6.patch + +# Install TransformerEngine — ROCm 7.0 has hip_fp4.h and gfx950 support, +# so we can use the latest dev branch with full CK fused attention. 
+RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git && \ + cd TransformerEngine && \ + NVTE_FRAMEWORK=pytorch \ + NVTE_ROCM_ARCH="gfx942;gfx950" \ + python3 setup.py install +RUN python3 -c "import transformer_engine.pytorch; print('TE installed successfully')" + +ADD . . +ENV USE_HIP_DATATYPE=1 +ENV USE_HIPBLAS_COMPUTETYPE=1 +RUN python3 -m pip install .[amdworker] && \ + CXX=/opt/rocm/bin/hipcc make cppbuild && \ + make postinstall diff --git a/dockerfile/rocm7.2.x.dockerfile b/dockerfile/rocm7.2.x.dockerfile new file mode 100644 index 000000000..0ca7b2d51 --- /dev/null +++ b/dockerfile/rocm7.2.x.dockerfile @@ -0,0 +1,211 @@ +ARG BASE_IMAGE=rocm/pytorch:rocm7.2.1_ubuntu24.04_py3.12_pytorch_release_2.9.1 + +FROM ${BASE_IMAGE} + +# OS: +# - Ubuntu: 24.04 +# - Docker Client: 20.10.8 +# ROCm: +# - ROCm: 7.2 +# Lib: +# - torch: 2.9.1 +# - rccl: release/rocm-rel-7.2 +# - hipblaslt: release-staging/rocm-rel-7.2 +# - rocblas: release-staging/rocm-rel-7.2 +# - openmpi: 4.1.x +# Intel: +# - mlc: v3.12 + +LABEL maintainer="SuperBench" + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && \ + apt-get -q install -y --no-install-recommends \ + autoconf \ + automake \ + bc \ + build-essential \ + curl \ + dmidecode \ + git \ + hipify-clang \ + iproute2 \ + jq \ + libaio-dev \ + libboost-program-options-dev \ + libcap2 \ + libcurl4-openssl-dev \ + libnuma-dev \ + libpci-dev \ + libssl-dev \ + libtinfo6 \ + libtool \ + lshw \ + net-tools \ + numactl \ + openssh-client \ + openssh-server \ + pciutils \ + python3-mpi4py \ + rsync \ + sudo \ + util-linux \ + vim \ + wget \ + && \ + rm -rf /tmp/* + +ARG NUM_MAKE_JOBS=64 + +# Check if CMake is installed and its version +RUN cmake_version=$(cmake --version 2>/dev/null | grep -oP "(?<=cmake version )(\d+\.\d+)" || echo "0.0") && \ + required_version="3.24.1" && \ + if [ "$(printf "%s\n" "$required_version" "$cmake_version" | sort -V | head -n 1)" != "$required_version" ]; then \ + echo "existing cmake version 
is ${cmake_version}" && \ + cd /tmp && \ + wget -q https://github.com/Kitware/CMake/releases/download/v${required_version}/cmake-${required_version}.tar.gz && \ + tar xzf cmake-${required_version}.tar.gz && \ + cd cmake-${required_version} && \ + ./bootstrap --prefix=/usr --no-system-curl --parallel=16 && \ + make -j ${NUM_MAKE_JOBS} && \ + make install && \ + rm -rf /tmp/cmake-${required_version}* \ + else \ + echo "CMake version is greater than or equal to 3.24.1"; \ + fi + +# Install Docker +ENV DOCKER_VERSION=20.10.8 +RUN cd /tmp && \ + wget -q https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \ + tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \ + rm docker.tgz + +# Update system config +RUN mkdir -p /root/.ssh && \ + touch /root/.ssh/authorized_keys && \ + mkdir -p /var/run/sshd && \ + sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \ + sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \ + sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \ + echo "* soft nofile 1048576\n* hard nofile 1048576" >> /etc/security/limits.conf && \ + echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf + + +# Set Ubuntu version +ENV UBUNTU_VERSION=24.04 + +# Install OFED +ENV OFED_VERSION=24.10-1.1.4.0 +RUN if ! command -v ofed_info >/dev/null 2>&1; then \ + echo "OFED not found. 
Installing OFED..."; \ + cd /tmp && \ + wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \ + tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \ + PATH=/usr/bin:${PATH} MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \ + rm -rf MLNX_OFED_LINUX-${OFED_VERSION}* ; \ + fi + +ENV ROCM_PATH=/opt/rocm + +# CMake 4.0+ dropped compat for cmake_minimum_required < 3.5. +# Set globally so all subprojects (RCCL, mscclpp, etc.) work. +ENV CMAKE_POLICY_VERSION_MINIMUM=3.5 + +# Install OpenMPI +ENV OPENMPI_VERSION=4.1.x +ENV MPI_HOME=/usr/local/mpi +RUN cd /tmp && \ + git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \ + cd ompi && \ + ./autogen.pl && \ + mkdir build && \ + cd build && \ + ../configure --prefix=/usr/local/mpi --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \ + make -j $(nproc) && \ + make -j $(nproc) install && \ + ldconfig && \ + cd / && \ + rm -rf /tmp/openmpi-${OPENMPI_VERSION}* + +# Install Intel MLC +RUN cd /tmp && \ + wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \ + tar xzf mlc.tgz Linux/mlc && \ + cp ./Linux/mlc /usr/local/bin/ && \ + rm -rf ./Linux mlc.tgz + +# Install RCCL +RUN cd /opt/ && \ + git clone -b release/rocm-rel-7.2 https://github.com/ROCmSoftwarePlatform/rccl.git && \ + cd rccl && \ + mkdir build && \ + cd build && \ + CXX=/opt/rocm/bin/hipcc cmake -DHIP_COMPILER=clang -DCMAKE_BUILD_TYPE=Release -DCMAKE_VERBOSE_MAKEFILE=1 \ + -DCMAKE_PREFIX_PATH="${ROCM_PATH}/hsa;${ROCM_PATH}/hip;${ROCM_PATH}/share/rocm/cmake/;${ROCM_PATH}" \ + .. && \ + make -j${NUM_MAKE_JOBS} + +# Install AMD SMI Python Library +RUN apt install amd-smi-lib -y && \ + cd /opt/rocm/share/amd_smi && \ + python3 -m pip install . 
+ +# Do NOT LD_PRELOAD librccl.so — causes segfaults on process exit due to +# HIP static object teardown order. Use LD_LIBRARY_PATH instead. +# Do NOT put /usr/lib/x86_64-linux-gnu/ before /opt/rocm/lib — OFED installs +# an old libhsa-runtime64.so there that conflicts with ROCm's version. +ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ + LD_LIBRARY_PATH="/opt/rccl/build:/usr/local/mpi/lib:/opt/rocm/lib:/usr/local/lib/:${LD_LIBRARY_PATH}" \ + SB_HOME=/opt/superbench \ + SB_MICRO_PATH=/opt/superbench \ + ANSIBLE_DEPRECATION_WARNINGS=FALSE \ + ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections + +RUN echo PATH="$PATH" > /etc/environment && \ + echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \ + echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment + +RUN apt install rocm-cmake -y && \ + python3 -m pip install --upgrade pip wheel "setuptools>=69.0" + +WORKDIR ${SB_HOME} + +ADD third_party third_party +# perftest_rocm6.patch changes are already upstream in the submodule version +# rocm_megatron_lm: broken upstream (pretrain_deepseek.py missing in rocm_dev branch) +# apex_rocm: skipped — all imports guarded, PyTorch 2.9 has native fused optimizers/AMP. 
+RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release-staging/rocm-rel-7.2 HIPBLASLT_BRANCH=release-staging/rocm-rel-7.2 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_hipblaslt -o rocm_megatron_lm -o apex_rocm +# Build hipblaslt separately with Tensile target-triple fix for clang +# Fix joblib race condition (github.com/joblib/joblib/issues/1788) for Python 3.12 +RUN pip install "joblib>=1.4.2" && \ + find / -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \ + 's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} + +RUN cd third_party && \ + git clone -b release-staging/rocm-rel-7.2 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ + sed -i 's/host-x86_64-unknown-linux,/host-x86_64-unknown-linux-gnu,/' \ + hipBLASLt/tensilelite/Tensile/BuildCommands/SharedCommands.py && \ + cd hipBLASLt && ./install.sh -dc && \ + find /opt -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \ + 's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} + && \ + cp -v build/release/clients/staging/hipblaslt-bench /opt/superbench/bin/ +RUN cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/lib/* /opt/rocm/lib/ && \ + cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/include/* /opt/rocm/include/ +RUN cd third_party/Megatron/Megatron-DeepSpeed && \ + git apply ../megatron_deepspeed_rocm6.patch + +# Install TransformerEngine — ROCm 7.2 has hip_fp4.h and gfx950 support, +# so we can use the latest dev branch with full CK + aotriton fused attention. 
+RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git && \ + cd TransformerEngine && \ + NVTE_FRAMEWORK=pytorch \ + NVTE_ROCM_ARCH="gfx942;gfx950" \ + python3 setup.py install +RUN python3 -c "import transformer_engine.pytorch; print('TE installed successfully')" + +ADD . . +ENV USE_HIP_DATATYPE=1 +ENV USE_HIPBLAS_COMPUTETYPE=1 +RUN python3 -m pip install .[amdworker] && \ + CXX=/opt/rocm/bin/hipcc make cppbuild && \ + make postinstall From cebeacf0e22d6570fe28cfb01f8c5874721239ab Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Mon, 13 Apr 2026 21:48:17 +0000 Subject: [PATCH 05/20] fix more stuff --- dockerfile/rocm6.4.x.dockerfile | 4 ++++ dockerfile/rocm7.0.x.dockerfile | 4 ++++ dockerfile/rocm7.2.x.dockerfile | 4 ++++ third_party/rccl-tests | 2 +- 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/dockerfile/rocm6.4.x.dockerfile b/dockerfile/rocm6.4.x.dockerfile index fd62ffe17..4c080ce50 100644 --- a/dockerfile/rocm6.4.x.dockerfile +++ b/dockerfile/rocm6.4.x.dockerfile @@ -209,3 +209,7 @@ ENV USE_HIPBLAS_COMPUTETYPE=1 RUN python3 -m pip install .[amdworker] && \ CXX=/opt/rocm/bin/hipcc make cppbuild && \ make postinstall + +# Fix stale hypothesis plugin from base image (imports removed pkg_resources) +# and add test dependencies missing from the base image. +RUN python3 -m pip install --upgrade hypothesis setuptools pytest-timeout vcrpy diff --git a/dockerfile/rocm7.0.x.dockerfile b/dockerfile/rocm7.0.x.dockerfile index 9d7bd3377..d8f37fbe2 100644 --- a/dockerfile/rocm7.0.x.dockerfile +++ b/dockerfile/rocm7.0.x.dockerfile @@ -207,3 +207,7 @@ ENV USE_HIPBLAS_COMPUTETYPE=1 RUN python3 -m pip install .[amdworker] && \ CXX=/opt/rocm/bin/hipcc make cppbuild && \ make postinstall + +# Fix stale hypothesis plugin from base image (imports removed pkg_resources) +# and add test dependencies missing from the base image. 
+RUN python3 -m pip install --upgrade hypothesis setuptools pytest-timeout vcrpy diff --git a/dockerfile/rocm7.2.x.dockerfile b/dockerfile/rocm7.2.x.dockerfile index 0ca7b2d51..dc7068b2d 100644 --- a/dockerfile/rocm7.2.x.dockerfile +++ b/dockerfile/rocm7.2.x.dockerfile @@ -209,3 +209,7 @@ ENV USE_HIPBLAS_COMPUTETYPE=1 RUN python3 -m pip install .[amdworker] && \ CXX=/opt/rocm/bin/hipcc make cppbuild && \ make postinstall + +# Fix stale hypothesis plugin from base image (imports removed pkg_resources) +# and add test dependencies missing from the base image. +RUN python3 -m pip install --upgrade hypothesis setuptools pytest-timeout vcrpy diff --git a/third_party/rccl-tests b/third_party/rccl-tests index 46375b1c5..0039629ac 160000 --- a/third_party/rccl-tests +++ b/third_party/rccl-tests @@ -1 +1 @@ -Subproject commit 46375b1c527b2e3afe80fdd6dd136151bd939675 +Subproject commit 0039629ac529ac9951ec9df5e243ed76c4cfb060 From 96ec635357c66ad34ee422827be2694c3f95b201 Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Wed, 15 Apr 2026 01:30:59 +0000 Subject: [PATCH 06/20] fix --- dockerfile/rocm6.4.x.dockerfile | 2 +- dockerfile/rocm7.0.x.dockerfile | 26 +++++++------------------- dockerfile/rocm7.2.x.dockerfile | 30 +++++++++--------------------- 3 files changed, 17 insertions(+), 41 deletions(-) diff --git a/dockerfile/rocm6.4.x.dockerfile b/dockerfile/rocm6.4.x.dockerfile index 4c080ce50..47ffaa6ce 100644 --- a/dockerfile/rocm6.4.x.dockerfile +++ b/dockerfile/rocm6.4.x.dockerfile @@ -71,7 +71,7 @@ RUN cmake_version=$(cmake --version 2>/dev/null | grep -oP "(?<=cmake version )( make install && \ rm -rf /tmp/cmake-${required_version}* \ else \ - echo "CMake version is greater than or equal to 3.24.1"; \ + echo "CMake version ${cmake_version} is greater than or equal to ${required_version}"; \ fi # Install Docker diff --git a/dockerfile/rocm7.0.x.dockerfile b/dockerfile/rocm7.0.x.dockerfile index d8f37fbe2..1df90f833 100644 --- a/dockerfile/rocm7.0.x.dockerfile +++ 
b/dockerfile/rocm7.0.x.dockerfile @@ -57,22 +57,11 @@ RUN apt-get update && \ ARG NUM_MAKE_JOBS=64 -# Check if CMake is installed and its version -RUN cmake_version=$(cmake --version 2>/dev/null | grep -oP "(?<=cmake version )(\d+\.\d+)" || echo "0.0") && \ - required_version="3.24.1" && \ - if [ "$(printf "%s\n" "$required_version" "$cmake_version" | sort -V | head -n 1)" != "$required_version" ]; then \ - echo "existing cmake version is ${cmake_version}" && \ - cd /tmp && \ - wget -q https://github.com/Kitware/CMake/releases/download/v${required_version}/cmake-${required_version}.tar.gz && \ - tar xzf cmake-${required_version}.tar.gz && \ - cd cmake-${required_version} && \ - ./bootstrap --prefix=/usr --no-system-curl --parallel=16 && \ - make -j ${NUM_MAKE_JOBS} && \ - make install && \ - rm -rf /tmp/cmake-${required_version}* \ - else \ - echo "CMake version is greater than or equal to 3.24.1"; \ - fi +# Install CMake via apt if not already present (Ubuntu 24.04 provides >= 3.28) +RUN if ! 
command -v cmake >/dev/null 2>&1; then \ + apt-get update && apt-get install -y --no-install-recommends cmake; \ + fi && \ + echo "CMake version: $(cmake --version | head -1)" # Install Docker ENV DOCKER_VERSION=20.10.8 @@ -181,9 +170,8 @@ RUN pip install "joblib>=1.4.2" && \ 's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} + RUN cd third_party && \ git clone -b release-staging/rocm-rel-7.0 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ - sed -i 's/host-x86_64-unknown-linux,/host-x86_64-unknown-linux-gnu,/' \ - hipBLASLt/tensilelite/Tensile/BuildCommands/SharedCommands.py && \ - cd hipBLASLt && ./install.sh -dc && \ + (sed -i 's/host-x86_64-unknown-linux,/host-x86_64-unknown-linux-gnu,/' hipBLASLt/tensilelite/Tensile/BuildCommands/SharedCommands.py 2>/dev/null || true) && \ + cd hipBLASLt && apt-get update -qq && ./install.sh -dc && \\ find /opt -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \ 's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} + && \ cp -v build/release/clients/staging/hipblaslt-bench /opt/superbench/bin/ diff --git a/dockerfile/rocm7.2.x.dockerfile b/dockerfile/rocm7.2.x.dockerfile index dc7068b2d..cabc7407b 100644 --- a/dockerfile/rocm7.2.x.dockerfile +++ b/dockerfile/rocm7.2.x.dockerfile @@ -57,22 +57,11 @@ RUN apt-get update && \ ARG NUM_MAKE_JOBS=64 -# Check if CMake is installed and its version -RUN cmake_version=$(cmake --version 2>/dev/null | grep -oP "(?<=cmake version )(\d+\.\d+)" || echo "0.0") && \ - required_version="3.24.1" && \ - if [ "$(printf "%s\n" "$required_version" "$cmake_version" | sort -V | head -n 1)" != "$required_version" ]; then \ - echo "existing cmake version is ${cmake_version}" && \ - cd /tmp && \ - wget -q https://github.com/Kitware/CMake/releases/download/v${required_version}/cmake-${required_version}.tar.gz && \ - tar xzf 
cmake-${required_version}.tar.gz && \ - cd cmake-${required_version} && \ - ./bootstrap --prefix=/usr --no-system-curl --parallel=16 && \ - make -j ${NUM_MAKE_JOBS} && \ - make install && \ - rm -rf /tmp/cmake-${required_version}* \ - else \ - echo "CMake version is greater than or equal to 3.24.1"; \ - fi +# Install CMake via apt if not already present (Ubuntu 24.04 provides >= 3.28) +RUN if ! command -v cmake >/dev/null 2>&1; then \ + apt-get update && apt-get install -y --no-install-recommends cmake; \ + fi && \ + echo "CMake version: $(cmake --version | head -1)" # Install Docker ENV DOCKER_VERSION=20.10.8 @@ -175,17 +164,16 @@ ADD third_party third_party # perftest_rocm6.patch changes are already upstream in the submodule version # rocm_megatron_lm: broken upstream (pretrain_deepseek.py missing in rocm_dev branch) # apex_rocm: skipped — all imports guarded, PyTorch 2.9 has native fused optimizers/AMP. -RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release-staging/rocm-rel-7.2 HIPBLASLT_BRANCH=release-staging/rocm-rel-7.2 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_hipblaslt -o rocm_megatron_lm -o apex_rocm +RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-7.2 HIPBLASLT_BRANCH=release/rocm-rel-7.2 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_hipblaslt -o rocm_megatron_lm -o apex_rocm # Build hipblaslt separately with Tensile target-triple fix for clang # Fix joblib race condition (github.com/joblib/joblib/issues/1788) for Python 3.12 RUN pip install "joblib>=1.4.2" && \ find / -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \ 's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} + RUN cd third_party && \ - git clone -b release-staging/rocm-rel-7.2 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ - sed -i 
's/host-x86_64-unknown-linux,/host-x86_64-unknown-linux-gnu,/' \ - hipBLASLt/tensilelite/Tensile/BuildCommands/SharedCommands.py && \ - cd hipBLASLt && ./install.sh -dc && \ + git clone -b release/rocm-rel-7.2 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ + (sed -i 's/host-x86_64-unknown-linux,/host-x86_64-unknown-linux-gnu,/' hipBLASLt/tensilelite/Tensile/BuildCommands/SharedCommands.py 2>/dev/null || true) && \ + cd hipBLASLt && apt-get update -qq && ./install.sh -dc && \\ find /opt -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \ 's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} + && \ cp -v build/release/clients/staging/hipblaslt-bench /opt/superbench/bin/ From 4500e8605310b81ae704039316e6bd317e833678 Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Wed, 15 Apr 2026 09:38:22 +0000 Subject: [PATCH 07/20] fix --- dockerfile/rocm7.0.x.dockerfile | 2 +- dockerfile/rocm7.2.x.dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dockerfile/rocm7.0.x.dockerfile b/dockerfile/rocm7.0.x.dockerfile index 1df90f833..23d7178f9 100644 --- a/dockerfile/rocm7.0.x.dockerfile +++ b/dockerfile/rocm7.0.x.dockerfile @@ -171,7 +171,7 @@ RUN pip install "joblib>=1.4.2" && \ RUN cd third_party && \ git clone -b release-staging/rocm-rel-7.0 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ (sed -i 's/host-x86_64-unknown-linux,/host-x86_64-unknown-linux-gnu,/' hipBLASLt/tensilelite/Tensile/BuildCommands/SharedCommands.py 2>/dev/null || true) && \ - cd hipBLASLt && apt-get update -qq && ./install.sh -dc && \\ + cd hipBLASLt && apt-get update -qq && ./install.sh -dc && \ find /opt -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \ 's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} + && \ cp -v build/release/clients/staging/hipblaslt-bench /opt/superbench/bin/ diff 
--git a/dockerfile/rocm7.2.x.dockerfile b/dockerfile/rocm7.2.x.dockerfile index cabc7407b..bffa437e8 100644 --- a/dockerfile/rocm7.2.x.dockerfile +++ b/dockerfile/rocm7.2.x.dockerfile @@ -173,7 +173,7 @@ RUN pip install "joblib>=1.4.2" && \ RUN cd third_party && \ git clone -b release/rocm-rel-7.2 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ (sed -i 's/host-x86_64-unknown-linux,/host-x86_64-unknown-linux-gnu,/' hipBLASLt/tensilelite/Tensile/BuildCommands/SharedCommands.py 2>/dev/null || true) && \ - cd hipBLASLt && apt-get update -qq && ./install.sh -dc && \\ + cd hipBLASLt && apt-get update -qq && ./install.sh -dc && \ find /opt -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \ 's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} + && \ cp -v build/release/clients/staging/hipblaslt-bench /opt/superbench/bin/ From ea934b753f82affff021d14ddaa35ede939c7108 Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Wed, 15 Apr 2026 20:54:18 +0000 Subject: [PATCH 08/20] try alternative way --- dockerfile/rocm7.0.x.dockerfile | 35 +++++++++++++++++++++------------ dockerfile/rocm7.2.x.dockerfile | 35 +++++++++++++++++++++------------ 2 files changed, 44 insertions(+), 26 deletions(-) diff --git a/dockerfile/rocm7.0.x.dockerfile b/dockerfile/rocm7.0.x.dockerfile index 23d7178f9..675e18fa8 100644 --- a/dockerfile/rocm7.0.x.dockerfile +++ b/dockerfile/rocm7.0.x.dockerfile @@ -163,20 +163,29 @@ ADD third_party third_party # rocm_megatron_lm: broken upstream (pretrain_deepseek.py missing in rocm_dev branch) # apex_rocm: skipped — all imports guarded, PyTorch 2.9 has native fused optimizers/AMP. 
RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release-staging/rocm-rel-7.0 HIPBLASLT_BRANCH=release-staging/rocm-rel-7.0 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_hipblaslt -o rocm_megatron_lm -o apex_rocm -# Build hipblaslt separately with Tensile target-triple fix for clang -# Fix joblib race condition (github.com/joblib/joblib/issues/1788) for Python 3.12 -RUN pip install "joblib>=1.4.2" && \ - find / -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \ - 's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} + +# Build hipblaslt-bench only (not the library) against system-installed hipBLASLt. +# Cannot build full hipBLASLt from source (requires AMD-internal 'origami' library). +# Strategy: clone repo, sed out origami references, disable host lib & device/Tensile, +# build only the client (hipblaslt-bench) linking against system roc::hipblaslt. RUN cd third_party && \ - git clone -b release-staging/rocm-rel-7.0 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ - (sed -i 's/host-x86_64-unknown-linux,/host-x86_64-unknown-linux-gnu,/' hipBLASLt/tensilelite/Tensile/BuildCommands/SharedCommands.py 2>/dev/null || true) && \ - cd hipBLASLt && apt-get update -qq && ./install.sh -dc && \ - find /opt -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \ - 's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} + && \ - cp -v build/release/clients/staging/hipblaslt-bench /opt/superbench/bin/ -RUN cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/lib/* /opt/rocm/lib/ && \ - cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/include/* /opt/rocm/include/ + git clone --depth 1 -b release-staging/rocm-rel-7.0 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ + cd hipBLASLt && \ + sed -i '/origami/d' 
CMakeLists.txt tensilelite/CMakeLists.txt && \ + sed -i '/mxdatagenerator\|mxDataGenerator/d' clients/CMakeLists.txt && \ + mkdir -p build/release && cd build/release && \ + CMAKE_POLICY_VERSION_MINIMUM= cmake \ + -DHIPBLASLT_ENABLE_HOST=OFF \ + -DHIPBLASLT_ENABLE_DEVICE=OFF \ + -DHIPBLASLT_ENABLE_CLIENT=ON \ + -DHIPBLASLT_ENABLE_ROCROLLER=OFF \ + -DHIPBLASLT_BUILD_TESTING=OFF \ + -DHIPBLASLT_ENABLE_SAMPLES=OFF \ + -DHIPBLASLT_ENABLE_LLVM=OFF \ + -DCMAKE_PREFIX_PATH=/opt/rocm \ + -DCMAKE_BUILD_TYPE=Release \ + ../.. && \ + make -j$(nproc) hipblaslt-bench && \ + cp -v clients/hipblaslt-bench /opt/superbench/bin/ RUN cd third_party/Megatron/Megatron-DeepSpeed && \ git apply ../megatron_deepspeed_rocm6.patch diff --git a/dockerfile/rocm7.2.x.dockerfile b/dockerfile/rocm7.2.x.dockerfile index bffa437e8..0d4972c2b 100644 --- a/dockerfile/rocm7.2.x.dockerfile +++ b/dockerfile/rocm7.2.x.dockerfile @@ -165,20 +165,29 @@ ADD third_party third_party # rocm_megatron_lm: broken upstream (pretrain_deepseek.py missing in rocm_dev branch) # apex_rocm: skipped — all imports guarded, PyTorch 2.9 has native fused optimizers/AMP. RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-7.2 HIPBLASLT_BRANCH=release/rocm-rel-7.2 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_hipblaslt -o rocm_megatron_lm -o apex_rocm -# Build hipblaslt separately with Tensile target-triple fix for clang -# Fix joblib race condition (github.com/joblib/joblib/issues/1788) for Python 3.12 -RUN pip install "joblib>=1.4.2" && \ - find / -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \ - 's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} + +# Build hipblaslt-bench only (not the library) against system-installed hipBLASLt. +# Cannot build full hipBLASLt from source (requires AMD-internal 'origami' library). 
+# Strategy: clone repo, sed out origami references, disable host lib & device/Tensile, +# build only the client (hipblaslt-bench) linking against system roc::hipblaslt. RUN cd third_party && \ - git clone -b release/rocm-rel-7.2 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ - (sed -i 's/host-x86_64-unknown-linux,/host-x86_64-unknown-linux-gnu,/' hipBLASLt/tensilelite/Tensile/BuildCommands/SharedCommands.py 2>/dev/null || true) && \ - cd hipBLASLt && apt-get update -qq && ./install.sh -dc && \ - find /opt -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \ - 's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} + && \ - cp -v build/release/clients/staging/hipblaslt-bench /opt/superbench/bin/ -RUN cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/lib/* /opt/rocm/lib/ && \ - cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/include/* /opt/rocm/include/ + git clone --depth 1 -b release/rocm-rel-7.2 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ + cd hipBLASLt && \ + sed -i '/origami/d' CMakeLists.txt tensilelite/CMakeLists.txt && \ + sed -i '/mxdatagenerator\|mxDataGenerator/d' clients/CMakeLists.txt && \ + mkdir -p build/release && cd build/release && \ + CMAKE_POLICY_VERSION_MINIMUM= cmake \ + -DHIPBLASLT_ENABLE_HOST=OFF \ + -DHIPBLASLT_ENABLE_DEVICE=OFF \ + -DHIPBLASLT_ENABLE_CLIENT=ON \ + -DHIPBLASLT_ENABLE_ROCROLLER=OFF \ + -DHIPBLASLT_BUILD_TESTING=OFF \ + -DHIPBLASLT_ENABLE_SAMPLES=OFF \ + -DHIPBLASLT_ENABLE_LLVM=OFF \ + -DCMAKE_PREFIX_PATH=/opt/rocm \ + -DCMAKE_BUILD_TYPE=Release \ + ../.. 
&& \ + make -j$(nproc) hipblaslt-bench && \ + cp -v clients/hipblaslt-bench /opt/superbench/bin/ RUN cd third_party/Megatron/Megatron-DeepSpeed && \ git apply ../megatron_deepspeed_rocm6.patch From a4e354a8a139cde0c41443919da733e5faf0330a Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Wed, 15 Apr 2026 21:48:17 +0000 Subject: [PATCH 09/20] fix --- dockerfile/rocm7.0.x.dockerfile | 20 ++++++++------------ dockerfile/rocm7.2.x.dockerfile | 4 +--- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/dockerfile/rocm7.0.x.dockerfile b/dockerfile/rocm7.0.x.dockerfile index 675e18fa8..cfae382d7 100644 --- a/dockerfile/rocm7.0.x.dockerfile +++ b/dockerfile/rocm7.0.x.dockerfile @@ -164,28 +164,24 @@ ADD third_party third_party # apex_rocm: skipped — all imports guarded, PyTorch 2.9 has native fused optimizers/AMP. RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release-staging/rocm-rel-7.0 HIPBLASLT_BRANCH=release-staging/rocm-rel-7.0 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_hipblaslt -o rocm_megatron_lm -o apex_rocm # Build hipblaslt-bench only (not the library) against system-installed hipBLASLt. -# Cannot build full hipBLASLt from source (requires AMD-internal 'origami' library). -# Strategy: clone repo, sed out origami references, disable host lib & device/Tensile, -# build only the client (hipblaslt-bench) linking against system roc::hipblaslt. +# Build hipblaslt-bench only against system hipBLASLt. +# 7.0 uses HIPBLASLT_USE_ROCROLLER (not ENABLE), BUILD_CLIENTS_BENCHMARKS, Tensile_SKIP_BUILD. 
RUN cd third_party && \ git clone --depth 1 -b release-staging/rocm-rel-7.0 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ cd hipBLASLt && \ - sed -i '/origami/d' CMakeLists.txt tensilelite/CMakeLists.txt && \ sed -i '/mxdatagenerator\|mxDataGenerator/d' clients/CMakeLists.txt && \ mkdir -p build/release && cd build/release && \ CMAKE_POLICY_VERSION_MINIMUM= cmake \ - -DHIPBLASLT_ENABLE_HOST=OFF \ - -DHIPBLASLT_ENABLE_DEVICE=OFF \ - -DHIPBLASLT_ENABLE_CLIENT=ON \ - -DHIPBLASLT_ENABLE_ROCROLLER=OFF \ - -DHIPBLASLT_BUILD_TESTING=OFF \ - -DHIPBLASLT_ENABLE_SAMPLES=OFF \ - -DHIPBLASLT_ENABLE_LLVM=OFF \ + -DHIPBLASLT_USE_ROCROLLER=OFF \ + -DBUILD_CLIENTS_BENCHMARKS=ON \ + -DBUILD_CLIENTS_TESTS=OFF \ + -DBUILD_CLIENTS_SAMPLES=OFF \ + -DTensile_SKIP_BUILD=ON \ -DCMAKE_PREFIX_PATH=/opt/rocm \ -DCMAKE_BUILD_TYPE=Release \ ../.. && \ make -j$(nproc) hipblaslt-bench && \ - cp -v clients/hipblaslt-bench /opt/superbench/bin/ + cp -v clients/staging/hipblaslt-bench /opt/superbench/bin/ RUN cd third_party/Megatron/Megatron-DeepSpeed && \ git apply ../megatron_deepspeed_rocm6.patch diff --git a/dockerfile/rocm7.2.x.dockerfile b/dockerfile/rocm7.2.x.dockerfile index 0d4972c2b..cd8aec6d2 100644 --- a/dockerfile/rocm7.2.x.dockerfile +++ b/dockerfile/rocm7.2.x.dockerfile @@ -166,9 +166,7 @@ ADD third_party third_party # apex_rocm: skipped — all imports guarded, PyTorch 2.9 has native fused optimizers/AMP. RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-7.2 HIPBLASLT_BRANCH=release/rocm-rel-7.2 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_hipblaslt -o rocm_megatron_lm -o apex_rocm # Build hipblaslt-bench only (not the library) against system-installed hipBLASLt. -# Cannot build full hipBLASLt from source (requires AMD-internal 'origami' library). 
-# Strategy: clone repo, sed out origami references, disable host lib & device/Tensile, -# build only the client (hipblaslt-bench) linking against system roc::hipblaslt. +# Origami is AMD-internal; sed it out. ROCROLLER disabled via flag, mxDataGenerator removed. RUN cd third_party && \ git clone --depth 1 -b release/rocm-rel-7.2 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ cd hipBLASLt && \ From 97acd02a5c2ea60c30df19c7457dfc2079ee8bb6 Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Tue, 21 Apr 2026 23:39:42 +0000 Subject: [PATCH 10/20] more fixes --- dockerfile/rocm7.0.x.dockerfile | 8 +++++++- dockerfile/rocm7.2.x.dockerfile | 5 ++++- third_party/Makefile | 3 +++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/dockerfile/rocm7.0.x.dockerfile b/dockerfile/rocm7.0.x.dockerfile index cfae382d7..b8faa3751 100644 --- a/dockerfile/rocm7.0.x.dockerfile +++ b/dockerfile/rocm7.0.x.dockerfile @@ -170,6 +170,12 @@ RUN cd third_party && \ git clone --depth 1 -b release-staging/rocm-rel-7.0 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ cd hipBLASLt && \ sed -i '/mxdatagenerator\|mxDataGenerator/d' clients/CMakeLists.txt && \ + sed -i '/cmake_policy( SET CMP0037 OLD )/d; s/add_custom_target( install/add_custom_target( hipblaslt_deps_install/' deps/CMakeLists.txt && \ + perl -0pi -e 's/set\(\s*gtest_custom_target\s+COMMAND\s+cd\s+\$\{GTEST_BINARY_ROOT\}\$\s+\$\{CMAKE_COMMAND\}\s+--build\s+\.\s+--target\s+install\s*\)/set( gtest_custom_target COMMAND \${CMAKE_COMMAND} --build \${GTEST_BINARY_ROOT} --target install )/s; s/set\(\s*lapack_custom_target\s+COMMAND\s+cd\s+\$\{LAPACK_BINARY_ROOT\}\$\s+\$\{CMAKE_COMMAND\}\s+--build\s+\.\s+--target\s+install\s*\)/set( lapack_custom_target COMMAND \${CMAKE_COMMAND} --build \${LAPACK_BINARY_ROOT} --target install )/s' deps/CMakeLists.txt && \ + # Pre-build the cblas/lapack dependency (normally done by ./install.sh -d). 
+ # install.sh -dc builds the full library which we want to skip; build deps standalone. + mkdir -p deps/build && cd deps/build && \ + CMAKE_POLICY_VERSION_MINIMUM=3.5 cmake .. && cmake --build . -j$(nproc) --target hipblaslt_deps_install && cd ../.. && \ mkdir -p build/release && cd build/release && \ CMAKE_POLICY_VERSION_MINIMUM= cmake \ -DHIPBLASLT_USE_ROCROLLER=OFF \ @@ -177,7 +183,7 @@ RUN cd third_party && \ -DBUILD_CLIENTS_TESTS=OFF \ -DBUILD_CLIENTS_SAMPLES=OFF \ -DTensile_SKIP_BUILD=ON \ - -DCMAKE_PREFIX_PATH=/opt/rocm \ + -DCMAKE_PREFIX_PATH="/opt/rocm;/usr/local" \ -DCMAKE_BUILD_TYPE=Release \ ../.. && \ make -j$(nproc) hipblaslt-bench && \ diff --git a/dockerfile/rocm7.2.x.dockerfile b/dockerfile/rocm7.2.x.dockerfile index cd8aec6d2..64dd4be7f 100644 --- a/dockerfile/rocm7.2.x.dockerfile +++ b/dockerfile/rocm7.2.x.dockerfile @@ -172,6 +172,9 @@ RUN cd third_party && \ cd hipBLASLt && \ sed -i '/origami/d' CMakeLists.txt tensilelite/CMakeLists.txt && \ sed -i '/mxdatagenerator\|mxDataGenerator/d' clients/CMakeLists.txt && \ + # Pre-build the cblas/lapack dependency (normally done by ./install.sh -d). + mkdir -p deps/build && cd deps/build && \ + CMAKE_POLICY_VERSION_MINIMUM=3.5 cmake .. && make -j$(nproc) && cd ../.. && \ mkdir -p build/release && cd build/release && \ CMAKE_POLICY_VERSION_MINIMUM= cmake \ -DHIPBLASLT_ENABLE_HOST=OFF \ @@ -181,7 +184,7 @@ RUN cd third_party && \ -DHIPBLASLT_BUILD_TESTING=OFF \ -DHIPBLASLT_ENABLE_SAMPLES=OFF \ -DHIPBLASLT_ENABLE_LLVM=OFF \ - -DCMAKE_PREFIX_PATH=/opt/rocm \ + -DCMAKE_PREFIX_PATH="/opt/rocm;/usr/local" \ -DCMAKE_BUILD_TYPE=Release \ ../.. && \ make -j$(nproc) hipblaslt-bench && \ diff --git a/third_party/Makefile b/third_party/Makefile index 2a09f5990..ad33da25c 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -168,8 +168,11 @@ rocm_hipblaslt: sb_micro_path # Build hipBusBandwidth. # HIP is released with rocm, like rocm-4.2.0 and so on. 
# The version we use is the released tag which is consistent with the rocm version in the environment or docker. +# Inject <cassert> include: newer clang (ROCm >= 7) rejects use of assert() without it. rocm_bandwidthTest: sb_micro_path git clone -b ${ROCM_VER} https://github.com/ROCm-Developer-Tools/HIP.git + grep -q '<cassert>' ./HIP/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp || \ + sed -i '1i #include <cassert>' ./HIP/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp cd ./HIP/samples/1_Utils/hipBusBandwidth/ && mkdir -p build && cd build && cmake .. && make cp -v ./HIP/samples/1_Utils/hipBusBandwidth/build/hipBusBandwidth $(SB_MICRO_PATH)/bin/ From a2c9b463a00019377419360ed454dd6654d271ac Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Wed, 22 Apr 2026 20:59:07 +0000 Subject: [PATCH 11/20] fix hipblaslt for 7.0 --- dockerfile/rocm7.0.x.dockerfile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/dockerfile/rocm7.0.x.dockerfile b/dockerfile/rocm7.0.x.dockerfile index b8faa3751..a77dee114 100644 --- a/dockerfile/rocm7.0.x.dockerfile +++ b/dockerfile/rocm7.0.x.dockerfile @@ -170,12 +170,17 @@ RUN cd third_party && \ git clone --depth 1 -b release-staging/rocm-rel-7.0 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ cd hipBLASLt && \ sed -i '/mxdatagenerator\|mxDataGenerator/d' clients/CMakeLists.txt && \ + sed -i 's/if(OS_RELEASE MATCHES "Ubuntu")/if(FALSE AND OS_RELEASE MATCHES "Ubuntu")/' clients/benchmarks/CMakeLists.txt && \ + sed -i '/add_dependencies(TENSILE_LIBRARY_TARGET rocisa)/d' library/src/amd_detail/rocblaslt/src/CMakeLists.txt && \ sed -i '/cmake_policy( SET CMP0037 OLD )/d; s/add_custom_target( install/add_custom_target( hipblaslt_deps_install/' deps/CMakeLists.txt && \ - perl -0pi -e 's/set\(\s*gtest_custom_target\s+COMMAND\s+cd\s+\$\{GTEST_BINARY_ROOT\}\$\s+\$\{CMAKE_COMMAND\}\s+--build\s+\.\s+--target\s+install\s*\)/set( gtest_custom_target COMMAND \${CMAKE_COMMAND} --build \${GTEST_BINARY_ROOT} --target install )/s;
s/set\(\s*lapack_custom_target\s+COMMAND\s+cd\s+\$\{LAPACK_BINARY_ROOT\}\$\s+\$\{CMAKE_COMMAND\}\s+--build\s+\.\s+--target\s+install\s*\)/set( lapack_custom_target COMMAND \${CMAKE_COMMAND} --build \${LAPACK_BINARY_ROOT} --target install )/s' deps/CMakeLists.txt && \ # Pre-build the cblas/lapack dependency (normally done by ./install.sh -d). # install.sh -dc builds the full library which we want to skip; build deps standalone. mkdir -p deps/build && cd deps/build && \ - CMAKE_POLICY_VERSION_MINIMUM=3.5 cmake .. && cmake --build . -j$(nproc) --target hipblaslt_deps_install && cd ../.. && \ + CMAKE_POLICY_VERSION_MINIMUM=3.5 cmake .. && \ + cmake --build . -j$(nproc) --target googletest lapack && \ + cmake --build gtest/src/googletest-build -j$(nproc) --target install && \ + cmake --build lapack/src/lapack-build -j$(nproc) --target install && \ + cd ../.. && \ mkdir -p build/release && cd build/release && \ CMAKE_POLICY_VERSION_MINIMUM= cmake \ -DHIPBLASLT_USE_ROCROLLER=OFF \ From 1344492a6624c17bde94f1c47e101a150e329048 Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Thu, 23 Apr 2026 21:26:34 +0000 Subject: [PATCH 12/20] fix ROCm 7.0 --- dockerfile/rocm7.0.x.dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dockerfile/rocm7.0.x.dockerfile b/dockerfile/rocm7.0.x.dockerfile index a77dee114..0b0907c15 100644 --- a/dockerfile/rocm7.0.x.dockerfile +++ b/dockerfile/rocm7.0.x.dockerfile @@ -201,6 +201,8 @@ RUN cd third_party/Megatron/Megatron-DeepSpeed && \ RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git && \ cd TransformerEngine && \ NVTE_FRAMEWORK=pytorch \ + NVTE_FUSED_ATTN_CK=0 \ + NVTE_FUSED_ATTN_AOTRITON=1 \ NVTE_ROCM_ARCH="gfx942;gfx950" \ python3 setup.py install RUN python3 -c "import transformer_engine.pytorch; print('TE installed successfully')" From 47a7620cd3677b9edf6fd7814460f39711fc4740 Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Wed, 29 Apr 2026 18:01:48 +0000 Subject: [PATCH 13/20] fix benchmark parsing --- 
.../micro_benchmarks/hipblaslt_function.py | 51 ++++++++++++++----- .../test_hipblaslt_function.py | 24 +++++++++ 2 files changed, 62 insertions(+), 13 deletions(-) diff --git a/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py b/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py index 3feb582d9..16220b92e 100644 --- a/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py +++ b/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py @@ -97,6 +97,11 @@ def _process_raw_result(self, cmd_idx, raw_output): self._result.add_raw_data() and self._result.add_result() need to be called to save the results. + The hipblaslt-bench output schema has grown over ROCm releases (older + versions emit 23 columns, newer versions emit 33+ columns including + scale factors, GB/s, etc.). To be robust across versions, this parser + looks up columns by header name rather than by fixed index. + Args: cmd_idx (int): the index of command corresponding with the raw_output. raw_output (str): raw output string of the micro-benchmark. @@ -108,27 +113,47 @@ def _process_raw_result(self, cmd_idx, raw_output): try: lines = raw_output.splitlines() - index = None - # Find the line containing 'hipblaslt-Gflops' + # Locate the header line (contains 'hipblaslt-Gflops'). + header_idx = None for i, line in enumerate(lines): if 'hipblaslt-Gflops' in line: - index = i + header_idx = i break - if index is None: - raise ValueError('Line with "hipblaslt-Gflops" not found in the log.') - - # Split the line into fields using a comma as the delimiter - fields = lines[index + 1].strip().split(',') + if header_idx is None or header_idx + 1 >= len(lines): + raise ValueError('Header line with "hipblaslt-Gflops" not found in the log.') + + # Parse header. The first column may carry a "[N]" or "[N]:" prefix + # (e.g. "[0]:transA" or "[0]transA"); strip it so column names match. 
+ header_fields = [h.strip() for h in lines[header_idx].split(',')] + first_col = header_fields[0] + if ']' in first_col: + first_col = first_col.split(']', 1)[1].lstrip(':').strip() + header_fields[0] = first_col + + # Build a name -> index map. + header_index = {name: idx for idx, name in enumerate(header_fields)} + for required in ('batch_count', 'm', 'n', 'k', 'hipblaslt-Gflops'): + if required not in header_index: + raise ValueError(f'Required column "{required}" not found in header.') + + # Parse the data row (immediately after the header). + data_fields = [v.strip() for v in lines[header_idx + 1].strip().split(',')] + if len(data_fields) != len(header_fields): + raise ValueError( + f'Data row has {len(data_fields)} fields but header has {len(header_fields)}.' + ) - # Check the number of fields and the format of the first two fields - if len(fields) != 23: - raise ValueError('Invalid result') + batch_count = data_fields[header_index['batch_count']] + m = data_fields[header_index['m']] + n = data_fields[header_index['n']] + k = data_fields[header_index['k']] + gflops = float(data_fields[header_index['hipblaslt-Gflops']]) self._result.add_result( - f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}_flops', - float(fields[-2]) / 1000 + f'{self._precision_in_commands[cmd_idx]}_{batch_count}_{m}_{n}_{k}_flops', + gflops / 1000, ) except BaseException as e: self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE) diff --git a/tests/benchmarks/micro_benchmarks/test_hipblaslt_function.py b/tests/benchmarks/micro_benchmarks/test_hipblaslt_function.py index a93d93fbb..297cfcc09 100644 --- a/tests/benchmarks/micro_benchmarks/test_hipblaslt_function.py +++ b/tests/benchmarks/micro_benchmarks/test_hipblaslt_function.py @@ -108,5 +108,29 @@ def test_hipblaslt_gemm_result_parsing(self): self.assertEqual(2, len(benchmark.result)) self.assertEqual(58.6245, benchmark.result['fp16_1_896_896_896_flops'][0]) + # Positive case - 
newer hipblaslt-bench schema with 33 columns + # (adds a_type/b_type/c_type, scaleA/B/C/D, amaxD, bias_type, GB/s). + # The Gflops column is no longer the second-to-last field, so a + # robust parser must look up columns by name from the header row. + new_format_raw_output = """ +hipBLASLt version: 1200 +hipBLASLt git version: 5d47b8b46-dirty +Query device success: there are 1 devices +------------------------------------------------------------------------------- +Device ID 0 : AMD Instinct MI300X VF gfx942:sramecc+:xnack- +with 205.6 GB memory, max. SCLK 2100 MHz, max. MCLK 1300 MHz, compute capability 9.4 +maxGridDimX 2147483647, sharedMemPerBlock 65.5 KB, maxThreadsPerBlock 1024, warpSize 64 +------------------------------------------------------------------------------- + +Is supported 1 / Total solutions: 1 +[0]:transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,hipblaslt-GB/s,us + N,N,0,1,4096,4096,4096,1,4096,16777216,0,4096,16777216,4096,16777216,4096,16777216,f32_r,f32_r,f32_r,f32_r,f32_r,0,0,0,0,0,none,0,f32_r,134751,183.833,1019.95 +""" + benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1) + self.assertTrue(benchmark._process_raw_result(0, new_format_raw_output)) + self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code) + self.assertEqual(2, len(benchmark.result)) + self.assertEqual(134.751, benchmark.result['fp16_1_4096_4096_4096_flops'][0]) + # Negative case - invalid raw output self.assertFalse(benchmark._process_raw_result(1, 'HipBLAS API failed')) From b9df63b6035c77862c2b942802bd7d63d9e71458 Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Thu, 30 Apr 2026 21:02:06 +0000 Subject: [PATCH 14/20] try more stuff Co-authored-by: Copilot --- .../etc/hipblaslt-bench-standalone.cmake | 114 ++++++++++++++++++ 
dockerfile/rocm7.2.x.dockerfile | 43 ++++--- 2 files changed, 139 insertions(+), 18 deletions(-) create mode 100644 dockerfile/etc/hipblaslt-bench-standalone.cmake diff --git a/dockerfile/etc/hipblaslt-bench-standalone.cmake b/dockerfile/etc/hipblaslt-bench-standalone.cmake new file mode 100644 index 000000000..33d48daa7 --- /dev/null +++ b/dockerfile/etc/hipblaslt-bench-standalone.cmake @@ -0,0 +1,114 @@ +# Standalone CMake for building hipblaslt-bench against system-installed +# hipBLASLt, bypassing the upstream build system. +# +# Used by dockerfile/rocm7.2.x.dockerfile because the upstream 7.2 source +# tree pulls in AMD-internal "origami" headers and a new tensilelite-host +# C++ library that conflict with the goal of building only the bench tool. +# +# Place this file at the root of an upstream hipBLASLt source tree +# (e.g. cp this to /path/to/hipBLASLt/CMakeLists-bench.txt) and invoke: +# +# cmake -B build -S /path/to/hipBLASLt -P /path/to/this/file +# +# Or use it as the top-level CMakeLists.txt by overwriting it. + +cmake_minimum_required(VERSION 3.21) +project(hipblaslt-bench-standalone LANGUAGES CXX HIP) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_HIP_STANDARD 17) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +# The directory containing this build script is the hipBLASLt source root. 
+set(HIPBLASLT_SRC "${CMAKE_CURRENT_SOURCE_DIR}") + +# --- Dependencies (all from system / preinstalled) --- +find_package(hip REQUIRED) +find_package(hipblaslt CONFIG REQUIRED) +find_package(LAPACK REQUIRED) # also brings BLAS via implicit find_package(BLAS) +find_package(OpenMP REQUIRED) +find_package(rocm_smi) # optional + +# --- The bench static helper library --- +add_library(hipblaslt-clients-common STATIC + "${HIPBLASLT_SRC}/clients/common/src/singletons.cpp" + "${HIPBLASLT_SRC}/clients/common/src/utility.cpp" + "${HIPBLASLT_SRC}/clients/common/src/efficiency_monitor.cpp" + "${HIPBLASLT_SRC}/clients/common/src/cblas_interface.cpp" + "${HIPBLASLT_SRC}/clients/common/src/argument_model.cpp" + "${HIPBLASLT_SRC}/clients/common/src/hipblaslt_parse_data.cpp" + "${HIPBLASLT_SRC}/clients/common/src/hipblaslt_arguments.cpp" + "${HIPBLASLT_SRC}/clients/common/src/hipblaslt_random.cpp" + "${HIPBLASLT_SRC}/clients/common/src/hipblaslt_init_device.cpp" +) + +# These .cpp files are HIP code (use __device__/__host__, hip_runtime APIs, +# half/bfloat16 types). Compiling them as plain CXX with gcc fails. Force HIP. 
+set_source_files_properties( + "${HIPBLASLT_SRC}/clients/common/src/utility.cpp" + "${HIPBLASLT_SRC}/clients/common/src/cblas_interface.cpp" + "${HIPBLASLT_SRC}/clients/common/src/hipblaslt_init_device.cpp" + "${HIPBLASLT_SRC}/clients/common/src/hipblaslt_arguments.cpp" + "${HIPBLASLT_SRC}/clients/common/src/hipblaslt_random.cpp" + "${HIPBLASLT_SRC}/clients/common/src/argument_model.cpp" + "${HIPBLASLT_SRC}/clients/common/src/hipblaslt_parse_data.cpp" + "${HIPBLASLT_SRC}/clients/common/src/efficiency_monitor.cpp" + "${HIPBLASLT_SRC}/clients/common/src/singletons.cpp" + PROPERTIES LANGUAGE HIP +) + +target_include_directories(hipblaslt-clients-common + PUBLIC + "${HIPBLASLT_SRC}/clients/common/include" + "${HIPBLASLT_SRC}/clients/bench/include" + # Source's library/include comes BEFORE system include so we get + # hipblaslt_xfloat32.h (not shipped in the system install). + "${HIPBLASLT_SRC}/library/include" + # Internal headers used by clients (rocblaslt/rocblaslt-types.h etc.) + "${HIPBLASLT_SRC}/library/src/amd_detail/include" + "${HIPBLASLT_SRC}/library/src/amd_detail/rocblaslt/include" + "${HIPBLASLT_SRC}/library/src/amd_detail/rocblaslt/src/include" +) + +target_compile_definitions(hipblaslt-clients-common + PUBLIC + # Critical: in 7.2 the upstream build sets ROCM_USE_FLOAT16 only + # via the in-tree hipblaslt target's INTERFACE_COMPILE_DEFINITIONS. + # The system find_package(hipblaslt) does not propagate it. Without + # this, hipblasLtHalf is the struct version with no operator float, + # which breaks hipblaslt_ostream.hpp. 
+ ROCM_USE_FLOAT16 + __HIP_PLATFORM_AMD__ + HIPBLASLT_BENCH + HIPBLASLT_INTERNAL_API +) + +target_link_libraries(hipblaslt-clients-common + PUBLIC + hip::host + hip::device + ${LAPACK_LIBRARIES} + OpenMP::OpenMP_CXX +) + +if(rocm_smi_FOUND) + target_link_libraries(hipblaslt-clients-common PRIVATE rocm_smi64) +endif() + +# Link against the system hipblaslt .so directly via library name to avoid +# inheriting INTERFACE_COMPILE_DEFINITIONS (HIPBLASLT_USE_ROCROLLER) from +# the imported roc::hipblaslt target. We only need linkage, not propagated +# defines. +target_link_directories(hipblaslt-clients-common PUBLIC /opt/rocm/lib) +target_link_libraries(hipblaslt-clients-common PUBLIC hipblaslt) + +# --- The bench executable --- +add_executable(hipblaslt-bench + "${HIPBLASLT_SRC}/clients/bench/src/client.cpp" +) +set_source_files_properties( + "${HIPBLASLT_SRC}/clients/bench/src/client.cpp" + PROPERTIES LANGUAGE HIP +) +target_link_libraries(hipblaslt-bench PRIVATE hipblaslt-clients-common) diff --git a/dockerfile/rocm7.2.x.dockerfile b/dockerfile/rocm7.2.x.dockerfile index 64dd4be7f..844a65e5c 100644 --- a/dockerfile/rocm7.2.x.dockerfile +++ b/dockerfile/rocm7.2.x.dockerfile @@ -165,38 +165,45 @@ ADD third_party third_party # rocm_megatron_lm: broken upstream (pretrain_deepseek.py missing in rocm_dev branch) # apex_rocm: skipped — all imports guarded, PyTorch 2.9 has native fused optimizers/AMP. RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-7.2 HIPBLASLT_BRANCH=release/rocm-rel-7.2 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_hipblaslt -o rocm_megatron_lm -o apex_rocm -# Build hipblaslt-bench only (not the library) against system-installed hipBLASLt. -# Origami is AMD-internal; sed it out. ROCROLLER disabled via flag, mxDataGenerator removed. +# Build hipblaslt-bench using our standalone CMake. 
The upstream 7.2 build +# system pulls in AMD-internal "origami" headers and a new tensilelite-host +# C++ library that is broken under our system (link to system libhipblaslt +# at runtime instead of building the full library). Our standalone CMake +# compiles only the bench sources against system hipblaslt + LAPACK. +COPY dockerfile/etc/hipblaslt-bench-standalone.cmake /tmp/hipblaslt-bench-standalone.cmake RUN cd third_party && \ git clone --depth 1 -b release/rocm-rel-7.2 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \ + cp /tmp/hipblaslt-bench-standalone.cmake hipBLASLt/CMakeLists.txt && \ cd hipBLASLt && \ - sed -i '/origami/d' CMakeLists.txt tensilelite/CMakeLists.txt && \ - sed -i '/mxdatagenerator\|mxDataGenerator/d' clients/CMakeLists.txt && \ - # Pre-build the cblas/lapack dependency (normally done by ./install.sh -d). + # Pre-build cblas/lapack into /usr/local using the upstream deps script. + # The deps superbuild builds but does not install lapack; install both + # gtest and lapack explicitly so find_package(BLAS)/find_package(LAPACK) + # can locate /usr/local/lib/{liblapack.a,libcblas.a,libblas.a}. mkdir -p deps/build && cd deps/build && \ - CMAKE_POLICY_VERSION_MINIMUM=3.5 cmake .. && make -j$(nproc) && cd ../.. && \ - mkdir -p build/release && cd build/release && \ + CMAKE_POLICY_VERSION_MINIMUM=3.5 cmake .. && \ + cmake --build . -j$(nproc) --target lapack && \ + cmake --build lapack/src/lapack-build -j$(nproc) --target install && \ + cd ../.. 
&& \ + mkdir -p build && cd build && \ CMAKE_POLICY_VERSION_MINIMUM= cmake \ - -DHIPBLASLT_ENABLE_HOST=OFF \ - -DHIPBLASLT_ENABLE_DEVICE=OFF \ - -DHIPBLASLT_ENABLE_CLIENT=ON \ - -DHIPBLASLT_ENABLE_ROCROLLER=OFF \ - -DHIPBLASLT_BUILD_TESTING=OFF \ - -DHIPBLASLT_ENABLE_SAMPLES=OFF \ - -DHIPBLASLT_ENABLE_LLVM=OFF \ + -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \ + -DCMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \ + -DCMAKE_HIP_ARCHITECTURES=gfx942 \ -DCMAKE_PREFIX_PATH="/opt/rocm;/usr/local" \ -DCMAKE_BUILD_TYPE=Release \ - ../.. && \ + .. && \ make -j$(nproc) hipblaslt-bench && \ - cp -v clients/hipblaslt-bench /opt/superbench/bin/ + cp -v hipblaslt-bench /opt/superbench/bin/ RUN cd third_party/Megatron/Megatron-DeepSpeed && \ git apply ../megatron_deepspeed_rocm6.patch -# Install TransformerEngine — ROCm 7.2 has hip_fp4.h and gfx950 support, -# so we can use the latest dev branch with full CK + aotriton fused attention. +# Install TransformerEngine — use AOTriton-only fused attention to avoid the +# CK + AITER build chain (validated working on ROCm 7.0; same TE upstream). 
RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git && \ cd TransformerEngine && \ NVTE_FRAMEWORK=pytorch \ + NVTE_FUSED_ATTN_CK=0 \ + NVTE_FUSED_ATTN_AOTRITON=1 \ NVTE_ROCM_ARCH="gfx942;gfx950" \ python3 setup.py install RUN python3 -c "import transformer_engine.pytorch; print('TE installed successfully')" From de5f8b0e5c2e181bdaf746c600dce9d518235ae3 Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Thu, 30 Apr 2026 23:30:40 +0000 Subject: [PATCH 15/20] more ROCm 7.2 workaround Co-authored-by: Copilot --- dockerfile/etc/hipblaslt-bench-standalone.cmake | 12 ++++++++++++ dockerfile/rocm7.2.x.dockerfile | 2 ++ 2 files changed, 14 insertions(+) diff --git a/dockerfile/etc/hipblaslt-bench-standalone.cmake b/dockerfile/etc/hipblaslt-bench-standalone.cmake index 33d48daa7..79bf8d35d 100644 --- a/dockerfile/etc/hipblaslt-bench-standalone.cmake +++ b/dockerfile/etc/hipblaslt-bench-standalone.cmake @@ -30,6 +30,11 @@ find_package(LAPACK REQUIRED) # also brings BLAS via implicit find_package(BLAS find_package(OpenMP REQUIRED) find_package(rocm_smi) # optional +# Locate cblas explicitly (not part of LAPACK's standard targets). +# cblas_interface.cpp uses cblas_sgemm/dgemm so we need the C BLAS library. +find_library(CBLAS_LIBRARY NAMES cblas PATHS /usr/local/lib /usr/lib REQUIRED) +message(STATUS "Found CBLAS: ${CBLAS_LIBRARY}") + # --- The bench static helper library --- add_library(hipblaslt-clients-common STATIC "${HIPBLASLT_SRC}/clients/common/src/singletons.cpp" @@ -69,6 +74,8 @@ target_include_directories(hipblaslt-clients-common "${HIPBLASLT_SRC}/library/src/amd_detail/include" "${HIPBLASLT_SRC}/library/src/amd_detail/rocblaslt/include" "${HIPBLASLT_SRC}/library/src/amd_detail/rocblaslt/src/include" + # tensilelite headers used by clients (e.g. client/include/Utility.hpp). 
+ "${HIPBLASLT_SRC}/tensilelite" ) target_compile_definitions(hipblaslt-clients-common @@ -88,7 +95,12 @@ target_link_libraries(hipblaslt-clients-common PUBLIC hip::host hip::device + # Order matters: cblas -> lapack -> blas -> gfortran (lapack needs blas + # which needs Fortran runtime). + ${CBLAS_LIBRARY} ${LAPACK_LIBRARIES} + ${BLAS_LIBRARIES} + gfortran OpenMP::OpenMP_CXX ) diff --git a/dockerfile/rocm7.2.x.dockerfile b/dockerfile/rocm7.2.x.dockerfile index 844a65e5c..d997595c8 100644 --- a/dockerfile/rocm7.2.x.dockerfile +++ b/dockerfile/rocm7.2.x.dockerfile @@ -190,6 +190,8 @@ RUN cd third_party && \ -DCMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \ -DCMAKE_HIP_ARCHITECTURES=gfx942 \ -DCMAKE_PREFIX_PATH="/opt/rocm;/usr/local" \ + -DBLAS_LIBRARIES=/usr/local/lib/libblas.a \ + -DLAPACK_LIBRARIES=/usr/local/lib/liblapack.a \ -DCMAKE_BUILD_TYPE=Release \ .. && \ make -j$(nproc) hipblaslt-bench && \ From 1e09f0acee61845a074de0e1e1b4d8a301fdba25 Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Fri, 1 May 2026 00:52:11 +0000 Subject: [PATCH 16/20] fix Co-authored-by: Copilot --- dockerfile/rocm7.2.x.dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dockerfile/rocm7.2.x.dockerfile b/dockerfile/rocm7.2.x.dockerfile index d997595c8..0aef2e0d4 100644 --- a/dockerfile/rocm7.2.x.dockerfile +++ b/dockerfile/rocm7.2.x.dockerfile @@ -201,7 +201,10 @@ RUN cd third_party/Megatron/Megatron-DeepSpeed && \ # Install TransformerEngine — use AOTriton-only fused attention to avoid the # CK + AITER build chain (validated working on ROCm 7.0; same TE upstream). -RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git && \ +# onnxscript is now an unconditional import in TE main (export.py); install it +# explicitly because TE's setup.py does not list it as a dependency. 
+RUN python3 -m pip install onnxscript && \ + git clone --recursive https://github.com/ROCm/TransformerEngine.git && \ cd TransformerEngine && \ NVTE_FRAMEWORK=pytorch \ NVTE_FUSED_ATTN_CK=0 \ From 1d15395873192b12e65dd20ead71256e06231685 Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Fri, 1 May 2026 08:06:49 +0000 Subject: [PATCH 17/20] reintroduce gpu_stream for ROCm Co-authored-by: Copilot --- .../benchmarks/micro_benchmarks/gpu_stream.py | 1 + .../gpu_stream/CMakeLists.txt | 116 +++++++++++++++--- .../micro_benchmarks/gpu_stream/gpu_stream.cu | 51 ++++++-- .../gpu_stream/gpu_stream_utils.hpp | 4 + 4 files changed, 147 insertions(+), 25 deletions(-) diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream.py b/superbench/benchmarks/micro_benchmarks/gpu_stream.py index 2e82262f3..43e40c065 100644 --- a/superbench/benchmarks/micro_benchmarks/gpu_stream.py +++ b/superbench/benchmarks/micro_benchmarks/gpu_stream.py @@ -117,3 +117,4 @@ def _process_raw_result(self, cmd_idx, raw_output): BenchmarkRegistry.register_benchmark('gpu-stream', GpuStreamBenchmark, platform=Platform.CUDA) +BenchmarkRegistry.register_benchmark('gpu-stream', GpuStreamBenchmark, platform=Platform.ROCM) diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt index ba3d2750b..897911c19 100644 --- a/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt +++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt @@ -11,34 +11,114 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED ON) +# Source files (CUDA-style names; hipify-perl translates them on ROCm path) +set(GPU_STREAM_SOURCES + gpu_stream_test.cpp + gpu_stream_utils.cpp + gpu_stream.cu + gpu_stream_kernels.cu +) +set(GPU_STREAM_HEADERS + gpu_stream.hpp + gpu_stream_utils.hpp + gpu_stream_kernels.hpp +) + find_package(CUDAToolkit QUIET) -if(NOT CUDAToolkit_FOUND) - 
message(WARNING "gpu_stream: CUDA not found, skipping build (requires NVIDIA GPU with NVML)") +if(CUDAToolkit_FOUND) + # NVIDIA / CUDA path + find_library(NVML_LIBRARY nvidia-ml PATHS ${CUDAToolkit_LIBRARY_DIR} PATH_SUFFIXES stubs) + if(NOT NVML_LIBRARY) + message(WARNING "gpu_stream: NVML (nvidia-ml) not found, skipping build") + return() + endif() + + message(STATUS "Found CUDA: ${CUDAToolkit_VERSION}") + include(../cuda_common.cmake) + add_executable(gpu_stream ${GPU_STREAM_SOURCES}) + set_property(TARGET gpu_stream PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED}) + target_include_directories(gpu_stream PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) + target_link_libraries(gpu_stream numa ${NVML_LIBRARY}) + install(TARGETS gpu_stream RUNTIME DESTINATION bin) return() endif() -# Check for NVML (nvidia-ml) library, required for querying memory clock rates -find_library(NVML_LIBRARY nvidia-ml PATHS ${CUDAToolkit_LIBRARY_DIR} PATH_SUFFIXES stubs) -if(NOT NVML_LIBRARY) - message(WARNING "gpu_stream: NVML (nvidia-ml) not found, skipping build") +# ROCm / HIP path +include(../rocm_common.cmake) +find_package(hip QUIET) + +if(NOT hip_FOUND) + message(WARNING "gpu_stream: neither CUDA nor HIP found, skipping build") return() endif() -message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION}) +message(STATUS "Found ROCm: ${HIP_VERSION}") +enable_language(HIP) -# Source files -set(SOURCES - gpu_stream_test.cpp - gpu_stream_utils.cpp - gpu_stream.cu - gpu_stream_kernels.cu +# rocm_smi is required for actual memory clock queries (NVML equivalent). 
+find_path(ROCM_SMI_INCLUDE_DIR rocm_smi/rocm_smi.h + HINTS ${ROCM_PATH} /opt/rocm + PATH_SUFFIXES include ) +find_library(ROCM_SMI_LIBRARY + NAMES rocm_smi64 + HINTS ${ROCM_PATH} /opt/rocm + PATH_SUFFIXES lib lib64 +) +if(NOT ROCM_SMI_INCLUDE_DIR OR NOT ROCM_SMI_LIBRARY) + message(WARNING "gpu_stream: rocm_smi not found, skipping build") + return() +endif() +message(STATUS "Found rocm_smi: ${ROCM_SMI_LIBRARY}") + +# Translate the CUDA-named sources/headers to HIP via hipify-perl into the +# build directory. hipify-perl translates cuda* identifiers and CUDA includes +# but leaves NVML/rocm_smi symbols untouched (they are gated by #ifdef in the +# source). +find_program(HIPIFY_PERL hipify-perl REQUIRED) + +set(HIPIFY_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/hipified) +file(MAKE_DIRECTORY ${HIPIFY_OUTPUT_DIR}) + +set(HIP_SOURCES "") +set(HIPIFY_OUTPUTS "") +foreach(SRC IN LISTS GPU_STREAM_SOURCES GPU_STREAM_HEADERS) + set(SRC_IN ${CMAKE_CURRENT_SOURCE_DIR}/${SRC}) + set(SRC_OUT ${HIPIFY_OUTPUT_DIR}/${SRC}) + add_custom_command( + OUTPUT ${SRC_OUT} + COMMAND ${HIPIFY_PERL} -o ${SRC_OUT} ${SRC_IN} + DEPENDS ${SRC_IN} + COMMENT "hipify-perl ${SRC}" + VERBATIM + ) + list(APPEND HIPIFY_OUTPUTS ${SRC_OUT}) + if(SRC MATCHES "\\.(cpp|cu)$") + list(APPEND HIP_SOURCES ${SRC_OUT}) + endif() +endforeach() -include(../cuda_common.cmake) -add_executable(gpu_stream ${SOURCES}) -set_property(TARGET gpu_stream PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED}) -target_include_directories(gpu_stream PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) -target_link_libraries(gpu_stream numa ${NVML_LIBRARY}) +# Aggregate target so all hipified outputs (including headers) are produced +# before any object is compiled. 
+add_custom_target(gpu_stream_hipify DEPENDS ${HIPIFY_OUTPUTS}) + +set_source_files_properties(${HIP_SOURCES} PROPERTIES LANGUAGE HIP) + +include(CheckSymbolExists) +check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY) +if(HIP_UNCACHED_MEMORY) + add_compile_definitions(HIP_UNCACHED_MEMORY) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") + +add_executable(gpu_stream ${HIP_SOURCES}) +add_dependencies(gpu_stream gpu_stream_hipify) +target_include_directories(gpu_stream PRIVATE + ${HIPIFY_OUTPUT_DIR} + ${ROCM_SMI_INCLUDE_DIR} +) +target_link_libraries(gpu_stream numa hip::device ${ROCM_SMI_LIBRARY}) install(TARGETS gpu_stream RUNTIME DESTINATION bin) diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu index 617b8338a..64b8c5d92 100644 --- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu +++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu @@ -9,7 +9,11 @@ #include "gpu_stream.hpp" #include #include +#if defined(__HIP_PLATFORM_AMD__) +#include +#else #include +#endif /** * @brief Destroys the CUDA events used for benchmarking. @@ -105,14 +109,16 @@ template int GpuStream::Destroy(std::unique_ptr> &args } /** - * @brief Gets the memory clock rate for a CUDA device. + * @brief Gets the memory clock rate for a GPU device. * - * @details This function gets the memory clock rate using the appropriate method - * based on CUDA version: CUDA 12.0+ uses NVML and cudaDeviceGetAttribute as a fallback; - * older CUDA versions use cudaDeviceProp. + * @details This function gets the memory clock rate using the appropriate method: + * - On NVIDIA with CUDA 12.0+: NVML for actual clock rate, falling back to + * cudaDeviceGetAttribute. Older CUDA versions use cudaDeviceProp.memoryClockRate. + * - On AMD: rocm_smi for actual clock rate, falling back to + * hipDeviceProp_t.memoryClockRate. 
* - * @param[in] device_id The ID of the CUDA device. - * @param[in] prop The properties of the CUDA device. + * @param[in] device_id The ID of the GPU device. + * @param[in] prop The properties of the GPU device. * @return float The memory clock rate in MHz, or -1.0f if retrieval fails. */ float GpuStream::GetMemoryClockRate(int device_id, const cudaDeviceProp &prop) { @@ -123,7 +129,15 @@ float GpuStream::GetMemoryClockRate(int device_id, const cudaDeviceProp &prop) { return -1.0f; } -#if CUDA_VERSION >= 12000 +#if defined(__HIP_PLATFORM_AMD__) + // ROCm: query actual memory clock rate via rocm_smi + memory_clock_mhz = GetActualMemoryClockRate(device_id); + + // If rocm_smi fails, fall back to prop.memoryClockRate (kHz) + if (memory_clock_mhz < 0.0f) { + memory_clock_mhz = prop.memoryClockRate / 1000.0f; + } +#elif CUDA_VERSION >= 12000 // For CUDA 12.0+, first try NVML for actual clock rate memory_clock_mhz = GetActualMemoryClockRate(device_id); @@ -511,6 +525,28 @@ int GpuStream::RunStreamKernel(std::unique_ptr> &args, Kernel kerne } float GpuStream::GetActualMemoryClockRate(int gpu_id) { +#if defined(__HIP_PLATFORM_AMD__) + // ROCm: query actual memory clock via rocm_smi. + // rsmi_dev_gpu_clk_freq_get returns frequencies in Hz; convert to MHz. + rsmi_status_t ret = rsmi_init(0); + if (ret != RSMI_STATUS_SUCCESS) { + std::cerr << "Failed to initialize ROCm SMI: status=" << ret << std::endl; + return -1.0f; + } + + rsmi_frequencies_t freq{}; + ret = rsmi_dev_gpu_clk_freq_get(static_cast(gpu_id), RSMI_CLK_TYPE_MEM, &freq); + if (ret != RSMI_STATUS_SUCCESS) { + std::cerr << "Failed to get memory clock from ROCm SMI: status=" << ret << std::endl; + rsmi_shut_down(); + return -1.0f; + } + + // freq.current is the index of the active frequency level; values are in Hz. 
+ float clock_mhz = static_cast(freq.frequency[freq.current]) / 1.0e6f; + rsmi_shut_down(); + return clock_mhz; +#else nvmlReturn_t result; nvmlDevice_t device; unsigned int clock_mhz; @@ -540,6 +576,7 @@ float GpuStream::GetActualMemoryClockRate(int gpu_id) { nvmlShutdown(); return static_cast(clock_mhz); +#endif } /** diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.hpp b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.hpp index 0c648514b..ab03915fb 100644 --- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.hpp +++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.hpp @@ -15,7 +15,11 @@ #include #include #include +#if defined(__HIP_PLATFORM_AMD__) +#include +#else #include +#endif // Custom deleter for GPU buffers struct GpuBufferDeleter { From 873439d3cb7e29e701d283ee1df4eeea274d58a5 Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Fri, 1 May 2026 08:14:27 +0000 Subject: [PATCH 18/20] add CI Co-authored-by: Copilot --- .github/workflows/build-image.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index bf809cd43..cd98ea51d 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -80,6 +80,24 @@ jobs: platforms: linux/amd64 runner: ubuntu-latest build_args: "NUM_MAKE_JOBS=8" + - name: rocm6.4 + dockerfile: rocm6.4.x + tags: superbench/main:rocm6.4 + platforms: linux/amd64 + runner: [self-hosted, linux/amd64, rocm] + build_args: "NUM_MAKE_JOBS=16" + - name: rocm7.0 + dockerfile: rocm7.0.x + tags: superbench/main:rocm7.0 + platforms: linux/amd64 + runner: [self-hosted, linux/amd64, rocm] + build_args: "NUM_MAKE_JOBS=16" + - name: rocm7.2 + dockerfile: rocm7.2.x + tags: superbench/main:rocm7.2 + platforms: linux/amd64 + runner: [self-hosted, linux/amd64, rocm] + build_args: "NUM_MAKE_JOBS=16" # - name: rocm6.2 # dockerfile: rocm6.2.x # tags: 
superbench/main:rocm6.2 From 565262b97fff591ee419fb8a015f6ef9d1f01217 Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Mon, 4 May 2026 12:31:55 -0700 Subject: [PATCH 19/20] Update superbench/benchmarks/micro_benchmarks/hipblaslt_function.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- superbench/benchmarks/micro_benchmarks/hipblaslt_function.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py b/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py index 16220b92e..47a097e0c 100644 --- a/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py +++ b/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py @@ -121,8 +121,10 @@ def _process_raw_result(self, cmd_idx, raw_output): header_idx = i break - if header_idx is None or header_idx + 1 >= len(lines): + if header_idx is None: raise ValueError('Header line with "hipblaslt-Gflops" not found in the log.') + if header_idx + 1 >= len(lines): + raise ValueError('Data row after header line with "hipblaslt-Gflops" not found in the log.') # Parse header. The first column may carry a "[N]" or "[N]:" prefix # (e.g. "[0]:transA" or "[0]transA"); strip it so column names match. From 846e7bc59b2ede1a0b7a7f0a7a90367718a74c16 Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Tue, 5 May 2026 00:12:12 +0000 Subject: [PATCH 20/20] fix documentation comments --- dockerfile/etc/hipblaslt-bench-standalone.cmake | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dockerfile/etc/hipblaslt-bench-standalone.cmake b/dockerfile/etc/hipblaslt-bench-standalone.cmake index 79bf8d35d..244d4786a 100644 --- a/dockerfile/etc/hipblaslt-bench-standalone.cmake +++ b/dockerfile/etc/hipblaslt-bench-standalone.cmake @@ -5,12 +5,12 @@ # tree pulls in AMD-internal "origami" headers and a new tensilelite-host # C++ library that conflict with the goal of building only the bench tool. 
# -# Place this file at the root of an upstream hipBLASLt source tree -# (e.g. cp this to /path/to/hipBLASLt/CMakeLists-bench.txt) and invoke: +# Place this file at the root of an upstream hipBLASLt source tree as the +# top-level CMakeLists.txt and configure it as a normal CMake project, e.g.: # -# cmake -B build -S /path/to/hipBLASLt -P /path/to/this/file -# -# Or use it as the top-level CMakeLists.txt by overwriting it. +# cp /path/to/this/file /path/to/hipBLASLt/CMakeLists.txt +# cmake -S /path/to/hipBLASLt -B /path/to/hipBLASLt/build +# cmake --build /path/to/hipBLASLt/build --target hipblaslt-bench cmake_minimum_required(VERSION 3.21) project(hipblaslt-bench-standalone LANGUAGES CXX HIP)