Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -186,4 +186,13 @@ tools/.addlicense.lock

# macOS metadata files
.DS_Store
._*
._*

# hipify (ROCm) build artifacts: generated in-tree by torch's BuildExtension
# from the csrc/ originals. Kept on disk for reference but never committed —
# they are derived, not source. See docs/rocm_phase2_report.md.
csrc/*_hip.cpp
csrc/inc/*_hip.hpp
# hipify renames cuda_utils.hpp -> hip_utils.hpp (substring swap, not a _hip
# suffix), so it needs an explicit entry. Still generated, not source.
csrc/inc/hip_utils.hpp
6 changes: 3 additions & 3 deletions csrc/allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "constants.hpp"
#include "cuda_utils.hpp"
#include "ftensor.hpp"
#include "gpu_vmm.hpp"
#include "page.hpp"
#include "torch_utils.hpp"

Expand Down Expand Up @@ -320,9 +321,8 @@ void FTensorAllocator::init_cuda_() {
CHECK_DRV(cuCtxGetDevice(&dev));

int supportsVMM = 0;
CHECK_DRV(cuDeviceGetAttribute(
&supportsVMM, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
dev));
CHECK_DRV(cuDeviceGetAttribute(&supportsVMM,
KVCACHED_GPU_ATTR_VMM_SUPPORTED, dev));
// LOGE("Supports VMM: %d", supportsVMM);

CUcontext context;
Expand Down
5 changes: 4 additions & 1 deletion csrc/ftensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "constants.hpp"
#include "cuda_utils.hpp"
#include "ftensor.hpp"
#include "gpu_vmm.hpp"
#include "page.hpp"

namespace kvcached {
Expand All @@ -23,7 +24,9 @@ static inline generic_ptr_t alloc_virtual_mem(const torch::Device &dev,
size_t offset = g_vaddr_allocated_offset.fetch_add(size);
if (dev.is_cuda()) {
CHECK_DRV(cuMemAddressReserve(reinterpret_cast<CUdeviceptr *>(&vaddr), size,
alignment_2mb, kStartAddr + offset, 0ULL));
alignment_2mb,
kvcached_vmm_addr(kStartAddr + offset),
0ULL));
} else {
vaddr = mmap(reinterpret_cast<void *>(kStartAddr + offset), size,
PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
Expand Down
45 changes: 45 additions & 0 deletions csrc/inc/gpu_vmm.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// SPDX-FileCopyrightText: Copyright contributors to the kvcached project
// SPDX-License-Identifier: Apache-2.0

#pragma once

// Thin GPU-VMM compatibility shim (ROCm port, Phase 2).
//
// torch's hipify pass mechanically renames nearly all of kvcached's CUDA
// driver surface (cu* -> hip*, CU* enums -> hip* enums). This header covers
// the two cases hipify cannot handle: a symbol missing from its CUDA->HIP map,
// and an API signature that genuinely differs between the two VMM ABIs. The
// CUDA build (un-hipified original) takes the #else branch; the HIP build
// (hipify output, compiled with -D__HIP_PLATFORM_AMD__=1) takes the #if branch.

#include <cstddef>

#if defined(__HIP_PLATFORM_AMD__)

#include <hip/hip_runtime.h>

// hipify's CUDA2HIP table has no entry for this device attribute, so the
// literal token survives untranslated and fails to compile. Map it here.
// (Phase 0 spike confirmed this attribute reads 1 on the gfx90a MI250X.)
#define KVCACHED_GPU_ATTR_VMM_SUPPORTED \
hipDeviceAttributeVirtualMemoryManagementSupported

// HIP flavor of the fixed-address hint helper.
//
// CUDA's cuMemAddressReserve/Map/Unmap/SetAccess take an integer CUdeviceptr,
// whereas HIP's hipMem* counterparts take a void*. hipify only renames tokens
// and cannot bridge that signature difference, so on HIP the numeric address
// is reinterpreted as a genuine void* here.
//
// addr: numeric virtual address to use as the hint.
// Returns the same address value expressed as a void*.
static inline void *kvcached_vmm_addr(size_t addr) {
  void *const hint = reinterpret_cast<void *>(addr);
  return hint;
}

#else

#include <cuda.h>

#define KVCACHED_GPU_ATTR_VMM_SUPPORTED \
CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED

// CUDA flavor of the fixed-address hint helper: converts the numeric address
// to the CUdeviceptr type with a plain static_cast.
//
// addr: numeric virtual address to use as the hint.
// Returns the same address value as a CUdeviceptr.
static inline CUdeviceptr kvcached_vmm_addr(size_t addr) {
  const CUdeviceptr hint = static_cast<CUdeviceptr>(addr);
  return hint;
}

#endif
25 changes: 19 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,20 @@
ROOT_PATH = SCRIPT_PATH
CSRC_PATH = os.path.join(ROOT_PATH, "csrc")

# torch-ROCm sets torch.version.hip and leaves torch.version.cuda None.
IS_ROCM = torch.version.hip is not None
ROCM_PATH = os.environ.get("ROCM_PATH", "/opt/rocm")


def get_csrc_files(path) -> List[str]:
    """Collect the C++ sources under ``path`` for the extension build.

    Walks ``path`` recursively for ``*.cpp`` files and returns each one as a
    string path relative to ``SCRIPT_PATH`` (setuptools requires relative
    paths).

    Files are skipped when their name:
      - starts with "._"  (macOS AppleDouble metadata files), or
      - ends with "_hip.cpp"  (hipify-generated artifacts).
    """

    def _is_buildable(candidate) -> bool:
        # Decide purely on the file name, never on the directory part.
        file_name = candidate.name
        if file_name.startswith("._"):
            return False
        return not file_name.endswith("_hip.cpp")

    sources = []
    for candidate in Path(path).rglob("*.cpp"):
        if _is_buildable(candidate):
            sources.append(str(candidate.relative_to(SCRIPT_PATH)))
    return sources

Expand All @@ -49,12 +55,19 @@ def get_extensions():
"-std=c++17", f"-D_GLIBCXX_USE_CXX11_ABI={int(cxx_abi)}"
]

gpu_lib = "amdhip64" if IS_ROCM else "cuda"
inc_dirs = include_paths() + [os.path.join(CSRC_PATH, "inc")]
lib_dirs = library_paths()
if IS_ROCM:
inc_dirs.append(os.path.join(ROCM_PATH, "include"))
lib_dirs.append(os.path.join(ROCM_PATH, "lib"))

vmm_ops_module = CUDAExtension(
"kvcached.vmm_ops",
csrc_files,
include_dirs=include_paths() + [os.path.join(CSRC_PATH, "inc")],
library_dirs=library_paths(),
libraries=["torch", "torch_cpu", "torch_python", "cuda"],
include_dirs=inc_dirs,
library_dirs=lib_dirs,
libraries=["torch", "torch_cpu", "torch_python", gpu_lib],
extra_compile_args={
"cxx": extra_compile_args,
"nvcc": extra_compile_args
Expand Down