Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
e507021
perf comparison between torch, fuser, and runtime TMA local matmul ops
samnordmann Feb 3, 2026
6465068
first fused comm/compute kernel for AG+GEMM
samnordmann Feb 12, 2026
dd2e7ce
add baseline torch eager nccl
samnordmann Feb 12, 2026
90366f6
add baseline torch eager cuda
samnordmann Feb 12, 2026
dda6dd1
centralize benchmark options, add time measurement mode and add_barri…
samnordmann Feb 12, 2026
f9e7985
cleanup
samnordmann Feb 12, 2026
08e6bb5
add fused staged kernels without overlap
samnordmann Feb 12, 2026
d5b78ab
fix race condition in multimem by using semaphores
samnordmann Feb 12, 2026
b9240f7
fix race condition in entering all kernels by using semaphores
samnordmann Feb 12, 2026
70b7ff0
refactor
samnordmann Feb 12, 2026
ab292b4
refactor kernels and do p2p wait
samnordmann Feb 12, 2026
1c53396
add matmulTma support
samnordmann Feb 12, 2026
bea9cbf
avoid copy_ the output
samnordmann Feb 12, 2026
fd0647d
renaming
samnordmann Feb 12, 2026
7a14b2a
only keep strongly synchronizing implementations
samnordmann Feb 12, 2026
579a945
add fused tma impl
samnordmann Feb 12, 2026
a747786
add variant with chunk-signaled async CUTLASS matmul
samnordmann Feb 18, 2026
1b9213a
remove local matmul perf test
samnordmann Feb 19, 2026
4d1bad0
major rewriting for clarity
samnordmann Feb 19, 2026
1c82f61
add header and batch cuda events timing
samnordmann Feb 23, 2026
93271bf
Use in-place matmulTma
samnordmann Feb 23, 2026
d61aed5
renaming and clarification
samnordmann Feb 23, 2026
4b08cd0
lint
samnordmann Feb 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -399,19 +399,20 @@ endif()
# "private" (not installed) object library.
add_library(codegen_internal OBJECT ${NVFUSER_SRCS})


if(NOT MSVC)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
target_compile_options(codegen_internal PRIVATE
-Wall -Wno-unused-function -Werror
$<$<COMPILE_LANGUAGE:CXX>:-Wall -Wno-unused-function -Werror

# These warnings are not treated as errors because of gcc 12.2 used in
# the manylinux image. Consider enabling them when we upgrade.
# Related discussion:
# https://github.com/NVIDIA/Fuser/pull/3001#discussion_r1772551266
-Wno-error=restrict -Wno-error=stringop-overflow -Wno-error=maybe-uninitialized)
-Wno-error=restrict -Wno-error=stringop-overflow -Wno-error=maybe-uninitialized>)
else()
target_compile_options(codegen_internal PRIVATE
-Wall -Wno-unused-function -Werror)
$<$<COMPILE_LANGUAGE:CXX>:-Wall -Wno-unused-function -Werror>)
endif()
endif()

Expand All @@ -423,6 +424,9 @@ if (NVMMH_FOUND)
endif()
target_include_directories(codegen_internal SYSTEM PUBLIC
${CMAKE_SOURCE_DIR}/third_party/flatbuffers/include
${NVFUSER_THIRD_PARTY_DIR}/cutlass/include
${NVFUSER_THIRD_PARTY_DIR}/cutlass/tools/util/include
/usr/local/cuda/include/cccl
PRIVATE
${CUDA_INCLUDE_DIRS}
)
Expand Down Expand Up @@ -919,7 +923,7 @@ function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK)

if(NOT MSVC)
target_compile_options(${TEST_NAME} PRIVATE
-Wall -Wno-unused-function -Werror
$<$<COMPILE_LANGUAGE:CXX>:-Wall -Wno-unused-function -Werror>
)
endif()
endfunction()
Expand Down Expand Up @@ -1019,6 +1023,8 @@ if(BUILD_TEST)
${NVFUSER_ROOT}/tests/cpp/test_multidevice_host_ir.cpp
${NVFUSER_ROOT}/tests/cpp/test_multidevice_host_ir_overlap.cpp
${NVFUSER_ROOT}/tests/cpp/test_multidevice_ipc.cpp
${NVFUSER_ROOT}/tests/cpp/test_multidevice_fused_remote_matmul.cpp
${NVFUSER_ROOT}/tests/cpp/test_multidevice_fused_remote_matmul_kernel.cu
${NVFUSER_ROOT}/tests/cpp/test_multidevice_lower_communication.cpp
${NVFUSER_ROOT}/tests/cpp/test_multidevice_lower_communication_cuda.cpp
${NVFUSER_ROOT}/tests/cpp/test_multidevice_matmul.cpp
Expand Down
23 changes: 23 additions & 0 deletions csrc/multidevice/symmetric_tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,11 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor)
}

SymmetricTensor::~SymmetricTensor() {
if (device_peer_ptrs_ != nullptr) {
cudaFree(device_peer_ptrs_);
device_peer_ptrs_ = nullptr;
}

#if (CUDA_VERSION >= 13000)
if (is_multicast_setup_) {
if (mc_base_ptr_) {
Expand Down Expand Up @@ -389,6 +394,24 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const {
.device(at::kCUDA, rank));
}

// Returns a device-resident table of per-rank peer base pointers
// (a void** living in device memory), so a CUDA kernel can index the
// symmetric allocation of any rank by its rank id.
//
// The table is built lazily on the first call and cached in
// device_peer_ptrs_; the destructor releases it. Requires that
// setupRemoteHandles() has already run (it populates remote_ptrs_).
//
// NOTE(review): this mutates `mutable` state from a const method and
// the lazy init is not guarded — assumed single-threaded use; confirm
// with callers before relying on concurrent access.
void** SymmetricTensor::devicePeerPointers() const {
  NVF_CHECK(are_remote_tensors_setup_, "Remote tensors not setup");
  if (device_peer_ptrs_ == nullptr) {
    // Stage the raw peer addresses in host memory first...
    std::vector<void*> host_peer_ptrs(world_size_);
    for (int64_t rank = 0; rank < world_size_; ++rank) {
      host_peer_ptrs[rank] = reinterpret_cast<void*>(remote_ptrs_[rank]);
    }
    // ...then upload them as one contiguous pointer array on the device.
    // If the memcpy fails, SAFE_CALL reports the error and the already
    // assigned device_peer_ptrs_ is reclaimed by the destructor.
    NVFUSER_CUDA_RT_SAFE_CALL(
        cudaMalloc(&device_peer_ptrs_, world_size_ * sizeof(void*)));
    NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy(
        device_peer_ptrs_,
        host_peer_ptrs.data(),
        world_size_ * sizeof(void*),
        cudaMemcpyHostToDevice));
  }
  return device_peer_ptrs_;
}

void* SymmetricTensor::multicastPtr() const {
NVF_CHECK(is_multicast_setup_, "Multicast not setup");
return mc_ptr_;
Expand Down
3 changes: 3 additions & 0 deletions csrc/multidevice/symmetric_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ class SymmetricTensor {
// Setup remote access (lazy, init-once)
void setupRemoteHandles(const std::string& tag = "");
at::Tensor remoteTensor(int64_t rank) const;
// Returns a lazily-built, device-resident table of peer base pointers (void** in device memory).
void** devicePeerPointers() const;

// Setup multicast (CUDA 13.0+, init-once)
void setupMulticast(int64_t exporter_rank, const std::string& tag = "");
Expand Down Expand Up @@ -79,6 +81,7 @@ class SymmetricTensor {
int peer_fd_{-1};
bool is_contiguous_view_setup_ = false;
at::Tensor contiguous_view_;
mutable void** device_peer_ptrs_ = nullptr;
};

} // namespace nvfuser
Loading
Loading