Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .github/workflows/build-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,24 @@ jobs:
platforms: linux/amd64
runner: ubuntu-latest
build_args: "NUM_MAKE_JOBS=8"
- name: rocm6.4
dockerfile: rocm6.4.x
tags: superbench/main:rocm6.4
platforms: linux/amd64
runner: [self-hosted, linux/amd64, rocm]
build_args: "NUM_MAKE_JOBS=16"
- name: rocm7.0
dockerfile: rocm7.0.x
tags: superbench/main:rocm7.0
platforms: linux/amd64
runner: [self-hosted, linux/amd64, rocm]
build_args: "NUM_MAKE_JOBS=16"
- name: rocm7.2
dockerfile: rocm7.2.x
tags: superbench/main:rocm7.2
platforms: linux/amd64
runner: [self-hosted, linux/amd64, rocm]
build_args: "NUM_MAKE_JOBS=16"
# - name: rocm6.2
# dockerfile: rocm6.2.x
# tags: superbench/main:rocm6.2
Expand Down
126 changes: 126 additions & 0 deletions dockerfile/etc/hipblaslt-bench-standalone.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# Standalone CMake for building hipblaslt-bench against system-installed
# hipBLASLt, bypassing the upstream build system.
#
# Used by dockerfile/rocm7.2.x.dockerfile because the upstream 7.2 source
# tree pulls in AMD-internal "origami" headers and a new tensilelite-host
# C++ library that conflict with the goal of building only the bench tool.
#
# Place this file at the root of an upstream hipBLASLt source tree as the
# top-level CMakeLists.txt and configure it as a normal CMake project, e.g.:
#
# cp /path/to/this/file /path/to/hipBLASLt/CMakeLists.txt
# cmake -S /path/to/hipBLASLt -B /path/to/hipBLASLt/build
# cmake --build /path/to/hipBLASLt/build --target hipblaslt-bench

cmake_minimum_required(VERSION 3.21)
project(hipblaslt-bench-standalone LANGUAGES CXX HIP)

# This file is meant to be dropped in as the top-level CMakeLists.txt of a
# hipBLASLt checkout, so the current source directory is the hipBLASLt root.
set(HIPBLASLT_SRC "${CMAKE_CURRENT_SOURCE_DIR}")

# Project-wide defaults picked up by every target created below:
# position-independent objects and C++17 for both CXX and HIP sources.
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_HIP_STANDARD 17)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# --- Dependencies (all from system / preinstalled) ---
# hip and hipblaslt must be present; configuration stops if either is missing.
find_package(hip REQUIRED)
find_package(hipblaslt CONFIG REQUIRED)
# FindLAPACK searches for BLAS on its own, which is why BLAS_LIBRARIES can be
# used further down without a separate find_package(BLAS).
find_package(LAPACK REQUIRED)
find_package(OpenMP REQUIRED)

# rocm_smi is optional. QUIET keeps a missing package from printing a
# configure-time warning; the outcome is reported once below instead.
find_package(rocm_smi QUIET)
if(rocm_smi_FOUND)
  message(STATUS "Found rocm_smi")
else()
  message(STATUS "rocm_smi not found (optional)")
endif()

# Locate cblas explicitly (not part of LAPACK's standard targets).
# cblas_interface.cpp uses cblas_sgemm/dgemm so we need the C BLAS library.
# PATHS adds two extra search locations on top of CMake's default ones.
find_library(CBLAS_LIBRARY NAMES cblas PATHS /usr/local/lib /usr/lib REQUIRED)
message(STATUS "Found CBLAS: ${CBLAS_LIBRARY}")

# --- The bench static helper library ---
# The helper sources are written down once and reused for both the target
# definition and the per-file language override, so the two cannot drift apart.
set(hipblaslt_clients_common_sources
  singletons.cpp
  utility.cpp
  efficiency_monitor.cpp
  cblas_interface.cpp
  argument_model.cpp
  hipblaslt_parse_data.cpp
  hipblaslt_arguments.cpp
  hipblaslt_random.cpp
  hipblaslt_init_device.cpp
)
list(TRANSFORM hipblaslt_clients_common_sources
  PREPEND "${HIPBLASLT_SRC}/clients/common/src/")

add_library(hipblaslt-clients-common STATIC
  ${hipblaslt_clients_common_sources}
)

# Although they carry a .cpp extension, these files contain HIP code
# (__device__/__host__, hip_runtime APIs, half/bfloat16 types) and do not
# build as plain CXX with gcc, so every one of them is compiled as HIP.
set_source_files_properties(
  ${hipblaslt_clients_common_sources}
  PROPERTIES LANGUAGE HIP
)

# Include paths are appended in call order, so the three groups below yield
# the same search order as a single combined list.

# Client-side headers (shared helpers and the bench tool's own headers).
target_include_directories(hipblaslt-clients-common PUBLIC
  "${HIPBLASLT_SRC}/clients/common/include"
  "${HIPBLASLT_SRC}/clients/bench/include"
)

# Library headers taken from the source tree. The public include directory is
# listed ahead of the system one so that hipblaslt_xfloat32.h, which the
# system install does not ship, is found. The amd_detail directories provide
# internal headers the clients use (rocblaslt/rocblaslt-types.h etc.).
target_include_directories(hipblaslt-clients-common PUBLIC
  "${HIPBLASLT_SRC}/library/include"
  "${HIPBLASLT_SRC}/library/src/amd_detail/include"
  "${HIPBLASLT_SRC}/library/src/amd_detail/rocblaslt/include"
  "${HIPBLASLT_SRC}/library/src/amd_detail/rocblaslt/src/include"
)

# tensilelite headers pulled in by the clients (e.g. client/include/Utility.hpp).
target_include_directories(hipblaslt-clients-common PUBLIC
  "${HIPBLASLT_SRC}/tensilelite"
)

# ROCM_USE_FLOAT16 is essential. In 7.2 the upstream build only sets it through
# the in-tree hipblaslt target's INTERFACE_COMPILE_DEFINITIONS, and the system
# find_package(hipblaslt) does not pass it on. When it is missing,
# hipblasLtHalf becomes the struct variant without operator float, and
# hipblaslt_ostream.hpp no longer compiles.
target_compile_definitions(hipblaslt-clients-common PUBLIC
  ROCM_USE_FLOAT16
)

# Remaining definitions, appended after ROCM_USE_FLOAT16 in the same order as
# before.
target_compile_definitions(hipblaslt-clients-common PUBLIC
  __HIP_PLATFORM_AMD__
  HIPBLASLT_BENCH
  HIPBLASLT_INTERNAL_API
)

# Link dependencies of the helper library. They are PUBLIC so that
# hipblaslt-bench, which links this static library, inherits them.
# NOTE(review): every source of this target is compiled as HIP, while
# OpenMP::OpenMP_CXX normally attaches its compile flags to CXX sources only;
# the OpenMP runtime is still linked, but OpenMP pragmas may not be enabled at
# compile time — confirm whether that is intended.
target_link_libraries(hipblaslt-clients-common
PUBLIC
hip::host
hip::device
# Order matters: cblas -> lapack -> blas -> gfortran (lapack needs blas
# which needs Fortran runtime).
${CBLAS_LIBRARY}
${LAPACK_LIBRARIES}
${BLAS_LIBRARIES}
# Passed as a bare library name, so it is resolved by the linker rather than
# located by CMake.
gfortran
OpenMP::OpenMP_CXX
)

# rocm_smi is optional: rocm_smi64 is linked only when find_package(rocm_smi)
# succeeded earlier in this file.
if(rocm_smi_FOUND)
target_link_libraries(hipblaslt-clients-common PRIVATE rocm_smi64)
endif()

# Link against the system hipblaslt .so directly via library name to avoid
# inheriting INTERFACE_COMPILE_DEFINITIONS (HIPBLASLT_USE_ROCROLLER) from
# the imported roc::hipblaslt target. We only need linkage, not propagated
# defines.
#
# The directory searched for the library is a cache variable so that a ROCm
# install outside /opt/rocm can be used without editing this file, e.g.
#   cmake -DHIPBLASLT_BENCH_ROCM_LIB_DIR=/opt/rocm-7.2.0/lib ...
# The default keeps the previous hard-coded location.
set(HIPBLASLT_BENCH_ROCM_LIB_DIR "/opt/rocm/lib" CACHE PATH
  "Directory that contains the installed hipblaslt shared library")
target_link_directories(hipblaslt-clients-common
  PUBLIC "${HIPBLASLT_BENCH_ROCM_LIB_DIR}")
target_link_libraries(hipblaslt-clients-common PUBLIC hipblaslt)

# --- The bench executable ---
# Single translation unit; like the helper sources it is compiled as HIP.
set(hipblaslt_bench_main "${HIPBLASLT_SRC}/clients/bench/src/client.cpp")

add_executable(hipblaslt-bench)
target_sources(hipblaslt-bench PRIVATE "${hipblaslt_bench_main}")
set_property(SOURCE "${hipblaslt_bench_main}" PROPERTY LANGUAGE HIP)

# Include paths, definitions and link dependencies all arrive through the
# PUBLIC usage requirements of the helper library.
target_link_libraries(hipblaslt-bench PRIVATE hipblaslt-clients-common)
2 changes: 1 addition & 1 deletion dockerfile/rocm6.2.x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ RUN cd /tmp && \

# Install Intel MLC
RUN cd /tmp && \
wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \
wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
tar xzf mlc.tgz Linux/mlc && \
cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz
Expand Down
215 changes: 215 additions & 0 deletions dockerfile/rocm6.4.x.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
# SuperBench image for ROCm 6.4. The base image defaults to the tag below and
# can be replaced at build time through the BASE_IMAGE build argument.
ARG BASE_IMAGE=rocm/pytorch:rocm6.4.4_ubuntu24.04_py3.12_pytorch_release_2.7.1

FROM ${BASE_IMAGE}

# Component versions this image is built around:
# OS:
# - Ubuntu: 24.04
# - Docker Client: 20.10.8
# ROCm:
# - ROCm: 6.4
# Lib:
# - torch: 2.7.1
# - rccl: release/rocm-rel-6.4
# - hipblaslt: release-staging/rocm-rel-6.4
# - rocblas: release-staging/rocm-rel-6.4
# - openmpi: 4.1.x
# Intel:
# - mlc: v3.12

LABEL maintainer="SuperBench"

# Install the OS-level build and runtime packages.
# DEBIAN_FRONTEND is set with ENV, so it also applies to every later step.
# The apt package lists are left in place: later steps in this file run
# `apt install` without another `apt-get update`.
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
apt-get -q install -y --no-install-recommends \
autoconf \
automake \
bc \
build-essential \
curl \
dmidecode \
git \
hipify-clang \
iproute2 \
jq \
libaio-dev \
libboost-program-options-dev \
libcap2 \
libcurl4-openssl-dev \
libnuma-dev \
libpci-dev \
libssl-dev \
libtinfo6 \
libtool \
lshw \
net-tools \
numactl \
openssh-client \
openssh-server \
pciutils \
python3-mpi4py \
rsync \
sudo \
util-linux \
vim \
wget \
&& \
rm -rf /tmp/*

# Parallelism for the make invocations below; override with --build-arg.
ARG NUM_MAKE_JOBS=64

# Make sure CMake >= 3.24.1 is available, building it from source when the
# installed version is older or CMake is missing (reported as 0.0).
# - The version pattern keeps the patch component when present, so an installed
#   3.24.x is compared in full instead of being truncated to 3.24.
# - The `;` after the final `rm -rf` ends the then-branch; without it the shell
#   passes `else echo ...` to rm as extra arguments and the else branch never runs.
RUN cmake_version=$(cmake --version 2>/dev/null | grep -oP "(?<=cmake version )\d+\.\d+(\.\d+)?" || echo "0.0") && \
    required_version="3.24.1" && \
    if [ "$(printf "%s\n" "$required_version" "$cmake_version" | sort -V | head -n 1)" != "$required_version" ]; then \
        echo "existing cmake version is ${cmake_version}" && \
        cd /tmp && \
        wget -q https://github.com/Kitware/CMake/releases/download/v${required_version}/cmake-${required_version}.tar.gz && \
        tar xzf cmake-${required_version}.tar.gz && \
        cd cmake-${required_version} && \
        ./bootstrap --prefix=/usr --no-system-curl --parallel=16 && \
        make -j ${NUM_MAKE_JOBS} && \
        make install && \
        rm -rf /tmp/cmake-${required_version}*; \
    else \
        echo "CMake version ${cmake_version} is greater than or equal to ${required_version}"; \
    fi

# Install the Docker client from the official static x86_64 bundle: the
# archive's top-level directory is stripped and its contents are unpacked
# straight into /usr/local/bin/.
ENV DOCKER_VERSION=20.10.8
RUN cd /tmp && \
    wget -q -O docker.tgz https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz && \
    tar -x -f docker.tgz --strip-components=1 -C /usr/local/bin/ && \
    rm docker.tgz

# Update system config:
# - prepare root's authorized_keys file and the sshd runtime directory,
# - let sshd accept root logins and user environment files, on port 22,
# - raise the open-file limits for all users and for root.
# printf writes one limits.conf entry per line regardless of which shell runs
# this step; `echo "...\n..."` only works when echo interprets backslash escapes.
RUN mkdir -p /root/.ssh && \
    touch /root/.ssh/authorized_keys && \
    mkdir -p /var/run/sshd && \
    sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
    sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
    sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \
    printf '%s\n' \
        '* soft nofile 1048576' \
        '* hard nofile 1048576' \
        'root soft nofile 1048576' \
        'root hard nofile 1048576' >> /etc/security/limits.conf


# Ubuntu release, used to pick the matching OFED bundle.
ENV UBUNTU_VERSION=24.04

# Install the user-space parts of MLNX OFED, but only when no `ofed_info`
# command is already on PATH (for example, provided by the base image).
ENV OFED_VERSION=24.10-1.1.4.0
RUN command -v ofed_info >/dev/null 2>&1 || { \
        echo "OFED not found. Installing OFED..."; \
        ofed_pkg="MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64" && \
        cd /tmp && \
        wget -q "http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/${ofed_pkg}.tgz" && \
        tar xzf "${ofed_pkg}.tgz" && \
        PATH=/usr/bin:${PATH} "${ofed_pkg}/mlnxofedinstall" --user-space-only --without-fw-update --force --all && \
        rm -rf MLNX_OFED_LINUX-${OFED_VERSION}* ; \
    }

ENV ROCM_PATH=/opt/rocm

# Install OpenMPI
ENV OPENMPI_VERSION=4.1.x
ENV MPI_HOME=/usr/local/mpi
# Build Open MPI from the v${OPENMPI_VERSION} git branch with ROCm support and
# install it under /usr/local/mpi. The build always runs; there is no check for
# an existing installation.
# The clone lands in /tmp/ompi, so that is the directory removed at the end
# (the earlier /tmp/openmpi-* pattern matched nothing and left the whole source
# and build tree in the image layer).
RUN cd /tmp && \
    git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \
    cd ompi && \
    ./autogen.pl && \
    mkdir build && \
    cd build && \
    ../configure --prefix=/usr/local/mpi --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \
    make -j $(nproc) && \
    make -j $(nproc) install && \
    ldconfig && \
    cd / && \
    rm -rf /tmp/ompi

# Install Intel MLC: fetch the v3.12 archive, extract only the Linux binary,
# copy it to /usr/local/bin/ and remove the download leftovers.
RUN cd /tmp && \
    wget -q -O mlc.tgz https://downloadmirror.intel.com/866182/mlc_v3.12.tgz && \
    tar -xzf mlc.tgz Linux/mlc && \
    cp Linux/mlc /usr/local/bin/ && \
    rm -rf Linux mlc.tgz

# Build RCCL from the ROCm 6.4 release branch with hipcc. There is no install
# step: the build output stays in /opt/rccl/build.
RUN cd /opt/ && \
    git clone --branch release/rocm-rel-6.4 https://github.com/ROCmSoftwarePlatform/rccl.git && \
    mkdir rccl/build && \
    cd rccl/build && \
    CXX=/opt/rocm/bin/hipcc cmake \
        -DHIP_COMPILER=clang \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_VERBOSE_MAKEFILE=1 \
        -DCMAKE_PREFIX_PATH="${ROCM_PATH}/hsa;${ROCM_PATH}/hip;${ROCM_PATH}/share/rocm/cmake/;${ROCM_PATH}" \
        .. && \
    make -j${NUM_MAKE_JOBS}

# Install AMD SMI Python Library: the amd-smi-lib package provides the sources
# under /opt/rocm/share/amd_smi, which are then installed with pip.
# apt-get is used instead of apt because apt's command-line interface is not
# guaranteed to be stable for scripted use and warns when run that way.
RUN apt-get install -y amd-smi-lib && \
    cd /opt/rocm/share/amd_smi && \
    python3 -m pip install .

# Runtime environment: Open MPI, SuperBench and ROCm binaries on PATH, the
# RCCL build tree and MPI/ROCm libraries on LD_LIBRARY_PATH, plus the
# SuperBench and Ansible settings.
ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}"
ENV LD_LIBRARY_PATH="/opt/rccl/build:/usr/local/mpi/lib:/opt/rocm/lib:/usr/local/lib/:${LD_LIBRARY_PATH}"
ENV SB_HOME=/opt/superbench
ENV SB_MICRO_PATH=/opt/superbench
ENV ANSIBLE_DEPRECATION_WARNINGS=FALSE
ENV ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections

# Replace /etc/environment with the three variables below, taken from the
# values set above.
RUN { \
        echo PATH="$PATH"; \
        echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH"; \
        echo SB_MICRO_PATH="$SB_MICRO_PATH"; \
    } > /etc/environment

# Install the rocm-cmake package and refresh the Python packaging tools.
# apt-get is used instead of apt because apt's command-line interface is not
# guaranteed to be stable for scripted use and warns when run that way.
RUN apt-get install -y rocm-cmake && \
    python3 -m pip install --upgrade pip wheel "setuptools>=69.0"

# From here on, relative paths resolve against SB_HOME.
WORKDIR ${SB_HOME}

# Only third_party is copied at this point; the rest of the build context is
# added further down by `ADD . .`.
ADD third_party third_party
# Build the `rocm` target of third_party/Makefile. Each `-o <target>` tells make
# to treat that target as already up to date, which is how the targets below
# are skipped:
# perftest_rocm6.patch changes are already upstream in the submodule version
# rocm_megatron_lm: broken upstream (pretrain_deepseek.py missing in rocm_dev branch)
# apex_rocm: skipped — all apex imports in Megatron-DeepSpeed are guarded with try/except,
# superbench has zero direct apex usage, and PyTorch 2.7 has native fused optimizers/AMP.
# rocm_hipblaslt is skipped here as well; hipBLASLt is cloned and built in a later step.
# NOTE(review): ROCM_VER=rocm-5.5.0 does not match the 6.4 branches passed in
# the same command — confirm that this value is intentional.
RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release-staging/rocm-rel-6.4 HIPBLASLT_BRANCH=release-staging/rocm-rel-6.4 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_hipblaslt -o rocm_megatron_lm -o apex_rocm
# Step 1: work around a joblib race condition
# (github.com/joblib/joblib/issues/1788) in Python 3.12:
# joblib's _retrieve() iterates _jobs_set while callbacks modify it.
# Fix: copy the set before iterating. Patch all joblib instances system-wide
# (every joblib/parallel.py found under /, excluding .git directories).
RUN pip install "joblib>=1.4.2" && \
find / -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \
's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} +
# Step 2: build hipblaslt separately with the Tensile target-triple fix for
# ROCm 6.4 clang (the sed on SharedCommands.py appends "-gnu" to the host
# triple), then copy hipblaslt-bench into /opt/superbench/bin/.
# The joblib patch is applied again under /opt once install.sh has finished.
# NOTE(review): because it runs after install.sh, this second patch cannot
# affect the build it follows — presumably it covers joblib copies that
# install.sh creates; confirm.
RUN cd third_party && \
git clone -b release-staging/rocm-rel-6.4 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \
sed -i 's/host-x86_64-unknown-linux,/host-x86_64-unknown-linux-gnu,/' \
hipBLASLt/tensilelite/Tensile/BuildCommands/SharedCommands.py && \
cd hipBLASLt && ./install.sh -dc && \
find /opt -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \
's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} + && \
cp -v build/release/clients/staging/hipblaslt-bench /opt/superbench/bin/
# Step 3: copy the freshly built hipBLASLt libraries and headers into the
# /opt/rocm tree.
RUN cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/lib/* /opt/rocm/lib/ && \
cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/include/* /opt/rocm/include/
# Apply the ROCm 6 patch to Megatron-DeepSpeed. `git -C` runs git from inside
# the checkout, so the patch path is resolved relative to that directory.
RUN git -C third_party/Megatron/Megatron-DeepSpeed apply ../megatron_deepspeed_rocm6.patch

# Install TransformerEngine at commit 386bd316, which predates the
# NVFP4/hip_fp4.h changes that need ROCm 7.0+.
# CK fused attention is switched off (the aiter submodule has gfx950-only
# code); aotriton stays enabled. The build targets gfx942 with the PyTorch
# framework.
RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git && \
    cd TransformerEngine && \
    git checkout 386bd316 && \
    git submodule update --init --recursive && \
    env NVTE_FRAMEWORK=pytorch NVTE_FUSED_ATTN_CK=0 NVTE_ROCM_ARCH=gfx942 \
        python3 setup.py install
# Fail the image build right away if the freshly installed package cannot be imported.
RUN python3 -c "import transformer_engine.pytorch; print('TE installed successfully')"

# Copy the rest of the build context into the working directory, then install
# SuperBench with its `amdworker` extra, build the C++ micro-benchmarks with
# hipcc and run the post-install step.
ADD . .
ENV USE_HIP_DATATYPE=1
ENV USE_HIPBLAS_COMPUTETYPE=1
# The requirement is quoted because an unquoted `.[amdworker]` is a shell glob
# pattern: it would be replaced by any matching file in the working directory
# (for example one named `.a`) instead of reaching pip unchanged.
RUN python3 -m pip install '.[amdworker]' && \
    CXX=/opt/rocm/bin/hipcc make cppbuild && \
    make postinstall

# Upgrade hypothesis (the copy in the base image is stale and imports the
# removed pkg_resources) and setuptools, and install the test dependencies
# that the base image lacks.
RUN python3 -m pip install --upgrade \
    hypothesis \
    setuptools \
    pytest-timeout \
    vcrpy
Loading
Loading