moreh-dev · hieple-moreh · May 19, 2026
diff --git a/.gitignore b/.gitignore
@@ -186,4 +186,13 @@ tools/.addlicense.lock
 
 # macOS metadata files
 .DS_Store
-._*
+._*
+
+# hipify (ROCm) build artifacts: generated in-tree by torch's BuildExtension
+# from the csrc/ originals. Kept on disk for reference but never committed —
+# they are derived, not source. See docs/rocm_phase2_report.md.
+csrc/*_hip.cpp
+csrc/inc/*_hip.hpp
+# hipify renames cuda_utils.hpp -> hip_utils.hpp (substring swap, not a _hip
+# suffix), so it needs an explicit entry. Still generated, not source.
+csrc/inc/hip_utils.hpp
diff --git a/csrc/allocator.cpp b/csrc/allocator.cpp
@@ -10,6 +10,7 @@
 #include "constants.hpp"
 #include "cuda_utils.hpp"
 #include "ftensor.hpp"
+#include "gpu_vmm.hpp"
 #include "page.hpp"
 #include "torch_utils.hpp"
 
@@ -320,9 +321,8 @@ void FTensorAllocator::init_cuda_() {
   CHECK_DRV(cuCtxGetDevice(&dev));
 
   int supportsVMM = 0;
-  CHECK_DRV(cuDeviceGetAttribute(
-      &supportsVMM, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
-      dev));
+  CHECK_DRV(cuDeviceGetAttribute(&supportsVMM,
+                                 KVCACHED_GPU_ATTR_VMM_SUPPORTED, dev));
   // LOGE("Supports VMM: %d", supportsVMM);
 
   CUcontext context;

diff --git a/csrc/ftensor.cpp b/csrc/ftensor.cpp
@@ -7,6 +7,7 @@
 #include "constants.hpp"
 #include "cuda_utils.hpp"
 #include "ftensor.hpp"
+#include "gpu_vmm.hpp"
 #include "page.hpp"
 
 namespace kvcached {
@@ -23,7 +24,9 @@ static inline generic_ptr_t alloc_virtual_mem(const torch::Device &dev,
   size_t offset = g_vaddr_allocated_offset.fetch_add(size);
   if (dev.is_cuda()) {
     CHECK_DRV(cuMemAddressReserve(reinterpret_cast<CUdeviceptr *>(&vaddr), size,
-                                  alignment_2mb, kStartAddr + offset, 0ULL));
+                                  alignment_2mb,
+                                  kvcached_vmm_addr(kStartAddr + offset),
+                                  0ULL));
   } else {
     vaddr = mmap(reinterpret_cast<void *>(kStartAddr + offset), size,
                  PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

diff --git a/csrc/inc/gpu_vmm.hpp b/csrc/inc/gpu_vmm.hpp
@@ -0,0 +1,45 @@
+// SPDX-FileCopyrightText: Copyright contributors to the kvcached project
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+// Thin GPU-VMM compatibility shim (ROCm port, Phase 2).
+//
+// torch's hipify pass mechanically renames nearly all of kvcached's CUDA
+// driver surface (cu* -> hip*, CU* enums -> hip* enums). This header covers
+// the two cases hipify cannot handle: a symbol missing from its CUDA->HIP map,
+// and an API signature that genuinely differs between the two VMM APIs. The
+// CUDA build (un-hipified original) takes the #else branch; the HIP build
+// (hipify output, compiled with -D__HIP_PLATFORM_AMD__=1) takes the #if branch.
+
+#include <cstddef>
+
+#if defined(__HIP_PLATFORM_AMD__)
+
+#include <hip/hip_runtime.h>
+
+// hipify's CUDA2HIP table has no entry for this device attribute, so the
+// literal token survives untranslated and fails to compile. Map it here.
+// (Phase 0 spike confirmed this attribute reads 1 on the gfx90a MI250X.)
+#define KVCACHED_GPU_ATTR_VMM_SUPPORTED                                        \
+  hipDeviceAttributeVirtualMemoryManagementSupported
+
+// cuMemAddressReserve/cuMemMap/cuMemUnmap/cuMemSetAccess take an integer
+// CUdeviceptr; the hipMem* counterparts take a void*. A rename cannot bridge
+// that, so this helper casts the fixed-address hint to each API's type.
+static inline void *kvcached_vmm_addr(size_t addr) {
+  return reinterpret_cast<void *>(addr);
+}
+
+#else
+
+#include <cuda.h>
+
+#define KVCACHED_GPU_ATTR_VMM_SUPPORTED                                        \
+  CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
+
+static inline CUdeviceptr kvcached_vmm_addr(size_t addr) {
+  return static_cast<CUdeviceptr>(addr);
+}
+
+#endif
diff --git a/setup.py b/setup.py
@@ -27,14 +27,20 @@
 ROOT_PATH = SCRIPT_PATH
 CSRC_PATH = os.path.join(ROOT_PATH, "csrc")
 
+# torch-ROCm sets torch.version.hip and leaves torch.version.cuda None.
+IS_ROCM = torch.version.hip is not None
+ROCM_PATH = os.environ.get("ROCM_PATH", "/opt/rocm")
+
 
 def get_csrc_files(path) -> List[str]:
     src_dir = Path(path)
-    # setuptools requires relative paths
-    # Filter out macOS AppleDouble metadata files (._* prefix)
+    # setuptools requires relative paths, so each hit is made relative to
+    # SCRIPT_PATH. Files excluded from the recursive "*.cpp" search:
+    #  - "._*": macOS AppleDouble metadata files.
+    #  - "*_hip.cpp": hipify output left in-tree by an earlier ROCm build.
     cpp_files = [
         str(f.relative_to(SCRIPT_PATH)) for f in src_dir.rglob("*.cpp")
-        if not f.name.startswith("._")
+        if not f.name.startswith("._") and not f.name.endswith("_hip.cpp")
     ]
     return cpp_files
 
@@ -49,12 +55,19 @@ def get_extensions():
         "-std=c++17", f"-D_GLIBCXX_USE_CXX11_ABI={int(cxx_abi)}"
     ]
 
+    gpu_lib = "amdhip64" if IS_ROCM else "cuda"
+    inc_dirs = include_paths() + [os.path.join(CSRC_PATH, "inc")]
+    lib_dirs = library_paths()
+    if IS_ROCM:
+        inc_dirs.append(os.path.join(ROCM_PATH, "include"))
+        lib_dirs.append(os.path.join(ROCM_PATH, "lib"))
+
     vmm_ops_module = CUDAExtension(
         "kvcached.vmm_ops",
         csrc_files,
-        include_dirs=include_paths() + [os.path.join(CSRC_PATH, "inc")],
-        library_dirs=library_paths(),
-        libraries=["torch", "torch_cpu", "torch_python", "cuda"],
+        include_dirs=inc_dirs,
+        library_dirs=lib_dirs,
+        libraries=["torch", "torch_cpu", "torch_python", gpu_lib],
         extra_compile_args={
             "cxx": extra_compile_args,
             "nvcc": extra_compile_args