Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
e507021
perf comparison between torch, fuser, and runtime TMA local matmul ops
samnordmann Feb 3, 2026
6465068
first fused comm/compute kernel for AG+GEMM
samnordmann Feb 12, 2026
dd2e7ce
add baseline torch eager nccl
samnordmann Feb 12, 2026
90366f6
add baseline torch eager cuda
samnordmann Feb 12, 2026
dda6dd1
centralize benchmark options, add time measurement mode and add_barri…
samnordmann Feb 12, 2026
f9e7985
cleanup
samnordmann Feb 12, 2026
08e6bb5
add fused staged kernels without overlap
samnordmann Feb 12, 2026
d5b78ab
fix race condition in multimem by using semaphores
samnordmann Feb 12, 2026
b9240f7
fix race condition in entering all kernels by using semaphores
samnordmann Feb 12, 2026
70b7ff0
refactor
samnordmann Feb 12, 2026
ab292b4
refactor kernels and do p2p wait
samnordmann Feb 12, 2026
1c53396
add matmulTma support
samnordmann Feb 12, 2026
bea9cbf
avoid copy_ the output
samnordmann Feb 12, 2026
fd0647d
renaming
samnordmann Feb 12, 2026
7a14b2a
only keep strongly synchronizing implementations
samnordmann Feb 12, 2026
579a945
add fused tma impl
samnordmann Feb 12, 2026
a747786
add variant with chunk-signaled async CUTLASS matmul
samnordmann Feb 18, 2026
1b9213a
remove local matmul perf test
samnordmann Feb 19, 2026
4d1bad0
major rewriting for clarity
samnordmann Feb 19, 2026
1c82f61
add header and batch cuda events timing
samnordmann Feb 23, 2026
93271bf
Use in-place matmulTma
samnordmann Feb 23, 2026
d61aed5
renaming and clarification
samnordmann Feb 23, 2026
4b08cd0
lint
samnordmann Feb 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -399,19 +399,20 @@ endif()
# "private" (not installed) object library.
add_library(codegen_internal OBJECT ${NVFUSER_SRCS})


if(NOT MSVC)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
target_compile_options(codegen_internal PRIVATE
-Wall -Wno-unused-function -Werror
$<$<COMPILE_LANGUAGE:CXX>:-Wall -Wno-unused-function -Werror

# These warnings are not treated as errors because of gcc 12.2 used in
# the manylinux image. Consider enabling them when we upgrade.
# Related discussion:
# https://github.com/NVIDIA/Fuser/pull/3001#discussion_r1772551266
-Wno-error=restrict -Wno-error=stringop-overflow -Wno-error=maybe-uninitialized)
-Wno-error=restrict -Wno-error=stringop-overflow -Wno-error=maybe-uninitialized>)
else()
target_compile_options(codegen_internal PRIVATE
-Wall -Wno-unused-function -Werror)
$<$<COMPILE_LANGUAGE:CXX>:-Wall -Wno-unused-function -Werror>)
endif()
endif()

Expand All @@ -423,6 +424,9 @@ if (NVMMH_FOUND)
endif()
target_include_directories(codegen_internal SYSTEM PUBLIC
${CMAKE_SOURCE_DIR}/third_party/flatbuffers/include
${NVFUSER_THIRD_PARTY_DIR}/cutlass/include
${NVFUSER_THIRD_PARTY_DIR}/cutlass/tools/util/include
/usr/local/cuda/include/cccl
PRIVATE
${CUDA_INCLUDE_DIRS}
)
Expand Down Expand Up @@ -919,7 +923,7 @@ function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK)

if(NOT MSVC)
target_compile_options(${TEST_NAME} PRIVATE
-Wall -Wno-unused-function -Werror
$<$<COMPILE_LANGUAGE:CXX>:-Wall -Wno-unused-function -Werror>
)
endif()
endfunction()
Expand Down Expand Up @@ -1019,6 +1023,8 @@ if(BUILD_TEST)
${NVFUSER_ROOT}/tests/cpp/test_multidevice_host_ir.cpp
${NVFUSER_ROOT}/tests/cpp/test_multidevice_host_ir_overlap.cpp
${NVFUSER_ROOT}/tests/cpp/test_multidevice_ipc.cpp
${NVFUSER_ROOT}/tests/cpp/test_multidevice_fused_remote_matmul.cpp
${NVFUSER_ROOT}/tests/cpp/test_multidevice_fused_remote_matmul_kernel.cu
${NVFUSER_ROOT}/tests/cpp/test_multidevice_lower_communication.cpp
${NVFUSER_ROOT}/tests/cpp/test_multidevice_lower_communication_cuda.cpp
${NVFUSER_ROOT}/tests/cpp/test_multidevice_matmul.cpp
Expand Down
23 changes: 23 additions & 0 deletions csrc/multidevice/symmetric_tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,11 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor)
}

SymmetricTensor::~SymmetricTensor() {
if (device_peer_ptrs_ != nullptr) {
cudaFree(device_peer_ptrs_);
device_peer_ptrs_ = nullptr;
}

#if (CUDA_VERSION >= 13000)
if (is_multicast_setup_) {
if (mc_base_ptr_) {
Expand Down Expand Up @@ -389,6 +394,24 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const {
.device(at::kCUDA, rank));
}

// Returns a device-resident table of per-rank peer base pointers
// (a void** living in device memory), so a CUDA kernel can index the
// symmetric allocation of any rank by its rank id.
//
// The table is built lazily on the first call and cached in
// device_peer_ptrs_; the destructor releases it. Requires that
// setupRemoteHandles() has already run (it populates remote_ptrs_).
//
// NOTE(review): this mutates `mutable` state from a const method and
// the lazy init is not guarded — assumed single-threaded use; confirm
// with callers before relying on concurrent access.
void** SymmetricTensor::devicePeerPointers() const {
  NVF_CHECK(are_remote_tensors_setup_, "Remote tensors not setup");
  if (device_peer_ptrs_ == nullptr) {
    // Stage the raw peer addresses in host memory first...
    std::vector<void*> host_peer_ptrs(world_size_);
    for (int64_t rank = 0; rank < world_size_; ++rank) {
      host_peer_ptrs[rank] = reinterpret_cast<void*>(remote_ptrs_[rank]);
    }
    // ...then upload them as one contiguous pointer array on the device.
    // If the memcpy fails, SAFE_CALL reports the error and the already
    // assigned device_peer_ptrs_ is reclaimed by the destructor.
    NVFUSER_CUDA_RT_SAFE_CALL(
        cudaMalloc(&device_peer_ptrs_, world_size_ * sizeof(void*)));
    NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy(
        device_peer_ptrs_,
        host_peer_ptrs.data(),
        world_size_ * sizeof(void*),
        cudaMemcpyHostToDevice));
  }
  return device_peer_ptrs_;
}

void* SymmetricTensor::multicastPtr() const {
NVF_CHECK(is_multicast_setup_, "Multicast not setup");
return mc_ptr_;
Expand Down
3 changes: 3 additions & 0 deletions csrc/multidevice/symmetric_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ class SymmetricTensor {
// Setup remote access (lazy, init-once)
void setupRemoteHandles(const std::string& tag = "");
at::Tensor remoteTensor(int64_t rank) const;
// Returns a lazily-built, device-resident table of peer base pointers (void** in device memory).
void** devicePeerPointers() const;

// Setup multicast (CUDA 13.0+, init-once)
void setupMulticast(int64_t exporter_rank, const std::string& tag = "");
Expand Down Expand Up @@ -79,6 +81,7 @@ class SymmetricTensor {
int peer_fd_{-1};
bool is_contiguous_view_setup_ = false;
at::Tensor contiguous_view_;
mutable void** device_peer_ptrs_ = nullptr;
};

} // namespace nvfuser
Loading
Loading