From 2d1f7c67c9dd70aa760bf0bd4081a300886bce87 Mon Sep 17 00:00:00 2001 From: hieple-moreh Date: Tue, 19 May 2026 15:51:31 +0700 Subject: [PATCH] Modify setup.py, csrc/ to build extension on a ROCm box instead of only CUDA --- .gitignore | 11 ++++++++++- csrc/allocator.cpp | 6 +++--- csrc/ftensor.cpp | 5 ++++- csrc/inc/gpu_vmm.hpp | 45 ++++++++++++++++++++++++++++++++++++++++++++ setup.py | 25 ++++++++++++++++++------ 5 files changed, 81 insertions(+), 11 deletions(-) create mode 100644 csrc/inc/gpu_vmm.hpp diff --git a/.gitignore b/.gitignore index d9a5da5f..9b26bb68 100644 --- a/.gitignore +++ b/.gitignore @@ -186,4 +186,13 @@ tools/.addlicense.lock # macOS metadata files .DS_Store -._* \ No newline at end of file +._* + +# hipify (ROCm) build artifacts: generated in-tree by torch's BuildExtension +# from the csrc/ originals. Kept on disk for reference but never committed — +# they are derived, not source. See docs/rocm_phase2_report.md. +csrc/*_hip.cpp +csrc/inc/*_hip.hpp +# hipify renames cuda_utils.hpp -> hip_utils.hpp (substring swap, not a _hip +# suffix), so it needs an explicit entry. Still generated, not source. 
+csrc/inc/hip_utils.hpp diff --git a/csrc/allocator.cpp b/csrc/allocator.cpp index 9adb2715..6fd4b48f 100644 --- a/csrc/allocator.cpp +++ b/csrc/allocator.cpp @@ -10,6 +10,7 @@ #include "constants.hpp" #include "cuda_utils.hpp" #include "ftensor.hpp" +#include "gpu_vmm.hpp" #include "page.hpp" #include "torch_utils.hpp" @@ -320,9 +321,8 @@ void FTensorAllocator::init_cuda_() { CHECK_DRV(cuCtxGetDevice(&dev)); int supportsVMM = 0; - CHECK_DRV(cuDeviceGetAttribute( - &supportsVMM, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, - dev)); + CHECK_DRV(cuDeviceGetAttribute(&supportsVMM, + KVCACHED_GPU_ATTR_VMM_SUPPORTED, dev)); // LOGE("Supports VMM: %d", supportsVMM); CUcontext context; diff --git a/csrc/ftensor.cpp b/csrc/ftensor.cpp index e4b4e76f..cc0ce9bc 100644 --- a/csrc/ftensor.cpp +++ b/csrc/ftensor.cpp @@ -7,6 +7,7 @@ #include "constants.hpp" #include "cuda_utils.hpp" #include "ftensor.hpp" +#include "gpu_vmm.hpp" #include "page.hpp" namespace kvcached { @@ -23,7 +24,9 @@ static inline generic_ptr_t alloc_virtual_mem(const torch::Device &dev, size_t offset = g_vaddr_allocated_offset.fetch_add(size); if (dev.is_cuda()) { CHECK_DRV(cuMemAddressReserve(reinterpret_cast<CUdeviceptr *>(&vaddr), size, - alignment_2mb, kStartAddr + offset, 0ULL)); + alignment_2mb, + kvcached_vmm_addr(kStartAddr + offset), + 0ULL)); } else { vaddr = mmap(reinterpret_cast<void *>(kStartAddr + offset), size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); diff --git a/csrc/inc/gpu_vmm.hpp b/csrc/inc/gpu_vmm.hpp new file mode 100644 index 00000000..a59b4024 --- /dev/null +++ b/csrc/inc/gpu_vmm.hpp @@ -0,0 +1,45 @@ +// SPDX-FileCopyrightText: Copyright contributors to the kvcached project +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +// Thin GPU-VMM compatibility shim (ROCm port, Phase 2). +// +// torch's hipify pass mechanically renames ~all of kvcached's CUDA driver +// surface (cu* -> hip*, CU* enums -> hip* enums). 
This header covers the two +// places hipify CANNOT: a symbol missing from its CUDA->HIP map, and an API +// signature that genuinely differs between the two VMM ABIs. The CUDA build +// (un-hipified original) takes the #else branch; the HIP build (hipify output, +// compiled with -D__HIP_PLATFORM_AMD__=1) takes the #if branch. + +#include <cstddef> + +#if defined(__HIP_PLATFORM_AMD__) + +#include <hip/hip_runtime.h> + +// hipify's CUDA2HIP table has no entry for this device attribute, so the +// literal token survives untranslated and fails to compile. Map it here. +// (Phase 0 spike confirmed this attribute reads 1 on the gfx90a MI250X.) +#define KVCACHED_GPU_ATTR_VMM_SUPPORTED \ + hipDeviceAttributeVirtualMemoryManagementSupported + +// CUDA's cuMemAddressReserve/Map/Unmap/SetAccess take an integer CUdeviceptr; +// HIP's hipMem* counterparts take a void*. A textual rename can't bridge a +// signature difference, so the fixed-address hint must be a real void* on HIP. +static inline void *kvcached_vmm_addr(size_t addr) { + return reinterpret_cast<void *>(addr); +} + +#else + +#include <cuda.h> + +#define KVCACHED_GPU_ATTR_VMM_SUPPORTED \ + CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED + +static inline CUdeviceptr kvcached_vmm_addr(size_t addr) { + return static_cast<CUdeviceptr>(addr); +} + +#endif diff --git a/setup.py b/setup.py index 3ea4ce83..83d197e5 100644 --- a/setup.py +++ b/setup.py @@ -27,14 +27,20 @@ ROOT_PATH = SCRIPT_PATH CSRC_PATH = os.path.join(ROOT_PATH, "csrc") +# torch-ROCm sets torch.version.hip and leaves torch.version.cuda None. +IS_ROCM = torch.version.hip is not None +ROCM_PATH = os.environ.get("ROCM_PATH", "/opt/rocm") + def get_csrc_files(path) -> List[str]: src_dir = Path(path) - # setuptools requires relative paths - # Filter out macOS AppleDouble metadata files (._* prefix) + # setuptools requires relative paths. + # Exclusions: + # - "._*": macOS AppleDouble metadata files. + # - "*_hip.cpp": hipify-generated artifacts. 
cpp_files = [ str(f.relative_to(SCRIPT_PATH)) for f in src_dir.rglob("*.cpp") - if not f.name.startswith("._") + if not f.name.startswith("._") and not f.name.endswith("_hip.cpp") ] return cpp_files @@ -49,12 +55,19 @@ def get_extensions(): "-std=c++17", f"-D_GLIBCXX_USE_CXX11_ABI={int(cxx_abi)}" ] + gpu_lib = "amdhip64" if IS_ROCM else "cuda" + inc_dirs = include_paths() + [os.path.join(CSRC_PATH, "inc")] + lib_dirs = library_paths() + if IS_ROCM: + inc_dirs.append(os.path.join(ROCM_PATH, "include")) + lib_dirs.append(os.path.join(ROCM_PATH, "lib")) + vmm_ops_module = CUDAExtension( "kvcached.vmm_ops", csrc_files, - include_dirs=include_paths() + [os.path.join(CSRC_PATH, "inc")], - library_dirs=library_paths(), - libraries=["torch", "torch_cpu", "torch_python", "cuda"], + include_dirs=inc_dirs, + library_dirs=lib_dirs, + libraries=["torch", "torch_cpu", "torch_python", gpu_lib], extra_compile_args={ "cxx": extra_compile_args, "nvcc": extra_compile_args