Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/build-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,12 @@ jobs:
platforms: linux/amd64
runner: ubuntu-latest
build_args: "NUM_MAKE_JOBS=8"
- name: rocm6.4
dockerfile: rocm6.4.x
tags: superbench/main:rocm6.4
platforms: linux/amd64
runner: [self-hosted, linux/amd64, rocm]
build_args: "NUM_MAKE_JOBS=16"
# - name: rocm6.2
# dockerfile: rocm6.2.x
# tags: superbench/main:rocm6.2
Expand Down
2 changes: 1 addition & 1 deletion dockerfile/rocm6.2.x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ RUN cd /tmp && \

# Install Intel MLC
RUN cd /tmp && \
wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \
wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
tar xzf mlc.tgz Linux/mlc && \
cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz
Expand Down
215 changes: 215 additions & 0 deletions dockerfile/rocm6.4.x.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
ARG BASE_IMAGE=rocm/pytorch:rocm6.4.4_ubuntu24.04_py3.12_pytorch_release_2.7.1

FROM ${BASE_IMAGE}

# OS:
# - Ubuntu: 24.04
# - Docker Client: 20.10.8
# ROCm:
# - ROCm: 6.4
# Lib:
# - torch: 2.7.1
# - rccl: release/rocm-rel-6.4
# - hipblaslt: release-staging/rocm-rel-6.4
# - rocblas: release-staging/rocm-rel-6.4
# - openmpi: 4.1.x
# Intel:
# - mlc: v3.12

LABEL maintainer="SuperBench"

ENV DEBIAN_FRONTEND=noninteractive
# Install the build toolchain, system libraries and command-line utilities used
# by the remaining steps of this Dockerfile. Recommended packages are not pulled
# in (--no-install-recommends). Only /tmp is cleaned afterwards; the apt package
# lists are not removed in this layer.
RUN apt-get update && \
apt-get -q install -y --no-install-recommends \
autoconf \
automake \
bc \
build-essential \
curl \
dmidecode \
git \
hipify-clang \
iproute2 \
jq \
libaio-dev \
libboost-program-options-dev \
libcap2 \
libcurl4-openssl-dev \
libnuma-dev \
libpci-dev \
libssl-dev \
libtinfo6 \
libtool \
lshw \
net-tools \
numactl \
openssh-client \
openssh-server \
pciutils \
python3-mpi4py \
rsync \
sudo \
util-linux \
vim \
wget \
&& \
rm -rf /tmp/*

ARG NUM_MAKE_JOBS=64

# Ensure CMake >= required_version is available; otherwise build it from source.
# - The full major.minor[.patch] version is extracted so that an installed
#   3.24.x is compared correctly against 3.24.1 by `sort -V`.
# - The then-branch is terminated with `;` before `else`; without it the shell
#   passes `else echo ...` to `rm` as extra arguments and the else-branch is
#   never executed.
RUN cmake_version=$(cmake --version 2>/dev/null | grep -oP "(?<=cmake version )\d+\.\d+(\.\d+)?" || echo "0.0") && \
    required_version="3.24.1" && \
    if [ "$(printf "%s\n" "$required_version" "$cmake_version" | sort -V | head -n 1)" != "$required_version" ]; then \
        echo "existing cmake version is ${cmake_version}" && \
        cd /tmp && \
        wget -q https://github.com/Kitware/CMake/releases/download/v${required_version}/cmake-${required_version}.tar.gz && \
        tar xzf cmake-${required_version}.tar.gz && \
        cd cmake-${required_version} && \
        ./bootstrap --prefix=/usr --no-system-curl --parallel=${NUM_MAKE_JOBS} && \
        make -j ${NUM_MAKE_JOBS} && \
        make install && \
        cd /tmp && \
        rm -rf /tmp/cmake-${required_version}*; \
    else \
        echo "CMake version ${cmake_version} is greater than or equal to ${required_version}"; \
    fi

# Install Docker
ENV DOCKER_VERSION=20.10.8
RUN cd /tmp && \
wget -q https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \
tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \
rm docker.tgz

# Update system config:
# - create root's SSH directory/authorized_keys and the sshd runtime directory,
# - allow root login and user environment files, and set the sshd port to 22
#   (the Port expression is anchored so that only the `Port` directive line is
#   rewritten, not other lines that merely contain "Port"),
# - raise the open-file limits for all users and for root. printf is used
#   instead of `echo "...\n..."` because whether echo expands "\n" depends on
#   the shell that executes RUN.
RUN mkdir -p /root/.ssh && \
    touch /root/.ssh/authorized_keys && \
    mkdir -p /var/run/sshd && \
    sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
    sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
    sed -i "s/^[# ]*Port .*/Port 22/" /etc/ssh/sshd_config && \
    printf '%s\n' "* soft nofile 1048576" "* hard nofile 1048576" >> /etc/security/limits.conf && \
    printf '%s\n' "root soft nofile 1048576" "root hard nofile 1048576" >> /etc/security/limits.conf


# Set Ubuntu version
ENV UBUNTU_VERSION=24.04

# Install OFED
ENV OFED_VERSION=24.10-1.1.4.0
# Install MLNX_OFED (user-space components only, no firmware update) when the
# `ofed_info` command is not already on PATH. This is a presence check only:
# an existing installation is kept regardless of its version.
# NOTE(review): the archive is downloaded over plain http:// - confirm whether
# an https endpoint can be used instead.
RUN if ! command -v ofed_info >/dev/null 2>&1; then \
echo "OFED not found. Installing OFED..."; \
cd /tmp && \
wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \
PATH=/usr/bin:${PATH} MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
rm -rf MLNX_OFED_LINUX-${OFED_VERSION}* ; \
fi

ENV ROCM_PATH=/opt/rocm

# Install OpenMPI
ENV OPENMPI_VERSION=4.1.x
ENV MPI_HOME=/usr/local/mpi
# Build Open MPI from the v${OPENMPI_VERSION} branch (configured with
# --with-rocm) and install it into ${MPI_HOME}. The build is unconditional.
# The clone lives in /tmp/ompi, so that is the directory removed afterwards
# to keep the source and build tree out of the image layer.
RUN cd /tmp && \
    git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \
    cd ompi && \
    ./autogen.pl && \
    mkdir build && \
    cd build && \
    ../configure --prefix=${MPI_HOME} --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=${ROCM_PATH} && \
    make -j $(nproc) && \
    make -j $(nproc) install && \
    ldconfig && \
    cd / && \
    rm -rf /tmp/ompi

# Install Intel MLC
# Only the Linux/mlc member is extracted from the downloaded archive; the binary
# is copied to /usr/local/bin and the archive and extracted directory are removed.
RUN cd /tmp && \
wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
tar xzf mlc.tgz Linux/mlc && \
cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz

# Install RCCL
# Clones the release/rocm-rel-6.4 branch into /opt/rccl and builds it in
# /opt/rccl/build with hipcc as the C++ compiler. No `make install` is run:
# the build directory is used in place (LD_LIBRARY_PATH and RCCL_HOME later in
# this file both point at /opt/rccl/build).
RUN cd /opt/ && \
git clone -b release/rocm-rel-6.4 https://github.com/ROCmSoftwarePlatform/rccl.git && \
cd rccl && \
mkdir build && \
cd build && \
CXX=/opt/rocm/bin/hipcc cmake -DHIP_COMPILER=clang -DCMAKE_BUILD_TYPE=Release -DCMAKE_VERBOSE_MAKEFILE=1 \
-DCMAKE_PREFIX_PATH="${ROCM_PATH}/hsa;${ROCM_PATH}/hip;${ROCM_PATH}/share/rocm/cmake/;${ROCM_PATH}" \
.. && \
make -j${NUM_MAKE_JOBS}

# Install AMD SMI Python Library
# The package index is refreshed in the same layer as the install so this step
# does not depend on a (possibly stale) index cached by an earlier layer, and
# apt-get is used because the `apt` front end is meant for interactive use.
RUN apt-get update && \
    apt-get -q install -y amd-smi-lib && \
    cd /opt/rocm/share/amd_smi && \
    python3 -m pip install .

# Runtime environment: prepend the MPI, SuperBench and ROCm binary/library
# directories (plus the in-place RCCL build) to the search paths, and set the
# SuperBench home and micro-benchmark locations and the Ansible settings.
ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
LD_LIBRARY_PATH="/opt/rccl/build:/usr/local/mpi/lib:/opt/rocm/lib:/usr/local/lib/:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections

# Persist PATH, LD_LIBRARY_PATH and SB_MICRO_PATH in /etc/environment. The first
# echo uses `>` and therefore replaces any existing content of that file.
# NOTE(review): presumably needed for SSH login sessions, which do not see the
# Docker ENV values - confirm.
RUN echo PATH="$PATH" > /etc/environment && \
echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment

# Install the ROCm CMake modules and bring the Python packaging tools up to date.
# The package index is refreshed in the same layer as the install so this step
# does not rely on an index cached by an earlier layer.
RUN apt-get update && \
    apt-get -q install -y rocm-cmake && \
    python3 -m pip install --upgrade pip wheel "setuptools>=69.0"

WORKDIR ${SB_HOME}

# Copy only the third_party tree first and build it in its own layer.
ADD third_party third_party
# perftest_rocm6.patch changes are already upstream in the submodule version
# rocm_megatron_lm: broken upstream (pretrain_deepseek.py missing in rocm_dev branch)
# apex_rocm: skipped — all apex imports in Megatron-DeepSpeed are guarded with try/except,
# superbench has zero direct apex usage, and PyTorch 2.7 has native fused optimizers/AMP.
# Each `-o <target>` makes GNU make treat that target as already up to date, so
# cpu_hpl, cpu_stream, megatron_lm, rocm_hipblaslt, rocm_megatron_lm and
# apex_rocm are not built by this invocation of the `rocm` goal.
# NOTE(review): ROCM_VER is rocm-5.5.0 while every other version in this file is
# 6.4 - confirm against third_party/Makefile that this value is intended.
RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release-staging/rocm-rel-6.4 HIPBLASLT_BRANCH=release-staging/rocm-rel-6.4 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_hipblaslt -o rocm_megatron_lm -o apex_rocm
# Work around a joblib race condition (github.com/joblib/joblib/issues/1788) in Python 3.12:
# joblib's _retrieve() iterates _jobs_set while callbacks modify it.
# Fix: copy the set before iterating. Patch all joblib instances system-wide.
RUN pip install "joblib>=1.4.2" && \
    find / -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \
        's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} +
# Build hipBLASLt separately with the Tensile target-triple fix for ROCm 6.4 clang:
# - clone the release-staging/rocm-rel-6.4 branch into third_party/hipBLASLt,
# - rewrite the host triple in Tensile's SharedCommands.py to the -gnu form,
# - run install.sh -dc,
# - once install.sh has finished, apply the joblib patch again to every
#   joblib/parallel.py found under /opt,
# - copy the hipblaslt-bench client into /opt/superbench/bin (the directory is
#   created first so that cp cannot turn the binary into a file named `bin`).
RUN cd third_party && \
    git clone -b release-staging/rocm-rel-6.4 https://github.com/ROCmSoftwarePlatform/hipBLASLt.git && \
    sed -i 's/host-x86_64-unknown-linux,/host-x86_64-unknown-linux-gnu,/' \
        hipBLASLt/tensilelite/Tensile/BuildCommands/SharedCommands.py && \
    cd hipBLASLt && ./install.sh -dc && \
    find /opt -path '*/joblib/parallel.py' -not -path '*/.git/*' -exec sed -i \
        's/timeout_control_job = next(iter(self\._jobs_set), None)/timeout_control_job = next(iter(set(self._jobs_set)), None)/' {} + && \
    mkdir -p /opt/superbench/bin && \
    cp -v build/release/clients/staging/hipblaslt-bench /opt/superbench/bin/
# Overlay the freshly built hipBLASLt libraries and headers onto the ROCm installation.
RUN cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/lib/* /opt/rocm/lib/ && \
    cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/include/* /opt/rocm/include/
# Apply megatron_deepspeed_rocm6.patch (stored one directory above the checkout)
# to the Megatron-DeepSpeed sources under third_party.
RUN cd third_party/Megatron/Megatron-DeepSpeed && \
git apply ../megatron_deepspeed_rocm6.patch

# Install TransformerEngine - pin to 386bd316 (before NVFP4/hip_fp4.h which needs ROCm 7.0+).
# Disable CK fused attention (aiter submodule has gfx950-only code); aotriton stays enabled.
# The submodules are re-synchronised after the checkout so that they match the
# pinned commit rather than the branch head that `git clone --recursive` fetched.
# NOTE(review): NVTE_ROCM_ARCH=gfx942 presumably limits the build to that GPU
# architecture - confirm this matches the intended target hardware.
RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git && \
cd TransformerEngine && \
git checkout 386bd316 && \
git submodule update --init --recursive && \
NVTE_FRAMEWORK=pytorch \
NVTE_FUSED_ATTN_CK=0 \
NVTE_ROCM_ARCH=gfx942 \
python3 setup.py install
# Smoke test: fail the image build if the installed package cannot be imported.
RUN python3 -c "import transformer_engine.pytorch; print('TE installed successfully')"

# Copy the whole build context into the working directory, then install the
# Python package with the `amdworker` extra, build the C++ micro-benchmarks
# with hipcc as the C++ compiler, and run the `postinstall` make target.
ADD . .
# NOTE(review): these two variables are presumably consumed by the cppbuild
# step below - confirm where they are read.
ENV USE_HIP_DATATYPE=1
ENV USE_HIPBLAS_COMPUTETYPE=1
RUN python3 -m pip install .[amdworker] && \
CXX=/opt/rocm/bin/hipcc make cppbuild && \
make postinstall

# Fix stale hypothesis plugin from base image (imports removed pkg_resources)
# and add test dependencies missing from the base image.
RUN python3 -m pip install --upgrade hypothesis setuptools pytest-timeout vcrpy
1 change: 1 addition & 0 deletions superbench/benchmarks/micro_benchmarks/gpu_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,4 @@ def _process_raw_result(self, cmd_idx, raw_output):


BenchmarkRegistry.register_benchmark('gpu-stream', GpuStreamBenchmark, platform=Platform.CUDA)
BenchmarkRegistry.register_benchmark('gpu-stream', GpuStreamBenchmark, platform=Platform.ROCM)
Comment on lines 119 to +120
116 changes: 98 additions & 18 deletions superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,34 +11,114 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)

# Source files (CUDA-style names; hipify-perl translates them on ROCm path)
set(GPU_STREAM_SOURCES
gpu_stream_test.cpp
gpu_stream_utils.cpp
gpu_stream.cu
gpu_stream_kernels.cu
)
set(GPU_STREAM_HEADERS
gpu_stream.hpp
gpu_stream_utils.hpp
gpu_stream_kernels.hpp
)

find_package(CUDAToolkit QUIET)

if(NOT CUDAToolkit_FOUND)
message(WARNING "gpu_stream: CUDA not found, skipping build (requires NVIDIA GPU with NVML)")
if(CUDAToolkit_FOUND)
    # NVIDIA / CUDA path. NVML (nvidia-ml) is looked up in the toolkit library
    # directory and its `stubs` subdirectory; without it the target is skipped
    # with a warning. Either way this file stops here when CUDA is present.
    find_library(NVML_LIBRARY
        NAMES nvidia-ml
        PATHS ${CUDAToolkit_LIBRARY_DIR}
        PATH_SUFFIXES stubs
    )

    if(NVML_LIBRARY)
        message(STATUS "Found CUDA: ${CUDAToolkit_VERSION}")
        include(../cuda_common.cmake)

        add_executable(gpu_stream ${GPU_STREAM_SOURCES})
        set_property(TARGET gpu_stream PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
        target_include_directories(gpu_stream PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
        target_link_libraries(gpu_stream PRIVATE numa ${NVML_LIBRARY})

        install(TARGETS gpu_stream RUNTIME DESTINATION bin)
    else()
        message(WARNING "gpu_stream: NVML (nvidia-ml) not found, skipping build")
    endif()

    return()
endif()

# Check for NVML (nvidia-ml) library, required for querying memory clock rates
find_library(NVML_LIBRARY nvidia-ml PATHS ${CUDAToolkit_LIBRARY_DIR} PATH_SUFFIXES stubs)
if(NOT NVML_LIBRARY)
message(WARNING "gpu_stream: NVML (nvidia-ml) not found, skipping build")
# ROCm / HIP path
include(../rocm_common.cmake)
find_package(hip QUIET)

if(NOT hip_FOUND)
message(WARNING "gpu_stream: neither CUDA nor HIP found, skipping build")
return()
endif()

message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
message(STATUS "Found ROCm: ${HIP_VERSION}")
enable_language(HIP)

# Source files
set(SOURCES
gpu_stream_test.cpp
gpu_stream_utils.cpp
gpu_stream.cu
gpu_stream_kernels.cu
# rocm_smi is required for actual memory clock queries (NVML equivalent).
find_path(ROCM_SMI_INCLUDE_DIR rocm_smi/rocm_smi.h
HINTS ${ROCM_PATH} /opt/rocm
PATH_SUFFIXES include
)
find_library(ROCM_SMI_LIBRARY
NAMES rocm_smi64
HINTS ${ROCM_PATH} /opt/rocm
PATH_SUFFIXES lib lib64
)
if(NOT ROCM_SMI_INCLUDE_DIR OR NOT ROCM_SMI_LIBRARY)
message(WARNING "gpu_stream: rocm_smi not found, skipping build")
return()
endif()
message(STATUS "Found rocm_smi: ${ROCM_SMI_LIBRARY}")

# Translate the CUDA-named sources/headers to HIP via hipify-perl into the
# build directory. hipify-perl translates cuda* identifiers and CUDA includes
# but leaves NVML/rocm_smi symbols untouched (they are gated by #ifdef in the
# source).
find_program(HIPIFY_PERL hipify-perl REQUIRED)

# All translated files are written below the build tree, never the source tree.
set(HIPIFY_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/hipified)
file(MAKE_DIRECTORY ${HIPIFY_OUTPUT_DIR})

# One build rule per source/header: <source dir>/<name> -> hipified/<name>.
# HIPIFY_OUTPUTS collects every translated file.
set(HIPIFY_OUTPUTS "")
foreach(file_name IN LISTS GPU_STREAM_SOURCES GPU_STREAM_HEADERS)
    set(hipified_file ${HIPIFY_OUTPUT_DIR}/${file_name})
    add_custom_command(
        OUTPUT ${hipified_file}
        COMMAND ${HIPIFY_PERL} -o ${hipified_file} ${CMAKE_CURRENT_SOURCE_DIR}/${file_name}
        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${file_name}
        COMMENT "hipify-perl ${file_name}"
        VERBATIM
    )
    list(APPEND HIPIFY_OUTPUTS ${hipified_file})
endforeach()

# HIP_SOURCES is the subset of translated files that are compiled
# (.cpp / .cu); the translated headers are only included.
set(HIP_SOURCES ${HIPIFY_OUTPUTS})
list(FILTER HIP_SOURCES INCLUDE REGEX "\\.(cpp|cu)$")

include(../cuda_common.cmake)
add_executable(gpu_stream ${SOURCES})
set_property(TARGET gpu_stream PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
target_include_directories(gpu_stream PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(gpu_stream numa ${NVML_LIBRARY})
# Aggregate target so all hipified outputs (including headers) are produced
# before any object is compiled.
add_custom_target(gpu_stream_hipify DEPENDS ${HIPIFY_OUTPUTS})

# The translated .cpp/.cu files are all compiled with the HIP language.
set_source_files_properties(${HIP_SOURCES} PROPERTIES LANGUAGE HIP)

# Default optimization level. Every translation unit of gpu_stream is compiled
# as HIP, so the flag has to go into the HIP flags; CMAKE_CXX_FLAGS is not
# applied to HIP-language sources. Appending to the base flags (rather than
# adding a target option) keeps the flag ahead of the per-configuration flags,
# so a build type such as Release can still override it.
string(APPEND CMAKE_HIP_FLAGS " -O2")

add_executable(gpu_stream ${HIP_SOURCES})
add_dependencies(gpu_stream gpu_stream_hipify)

# Define HIP_UNCACHED_MEMORY for this target only when the HIP runtime header
# provides hipDeviceMallocUncached.
include(CheckSymbolExists)
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
if(HIP_UNCACHED_MEMORY)
    target_compile_definitions(gpu_stream PRIVATE HIP_UNCACHED_MEMORY)
endif()

# Headers are taken from the hipified copies and from rocm_smi.
target_include_directories(gpu_stream PRIVATE
    ${HIPIFY_OUTPUT_DIR}
    ${ROCM_SMI_INCLUDE_DIR}
)
target_link_libraries(gpu_stream PRIVATE numa hip::device ${ROCM_SMI_LIBRARY})

install(TARGETS gpu_stream RUNTIME DESTINATION bin)
Loading
Loading