From 3b0d5deab74b140e3cbcf8290a93f675afb265d9 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 9 Dec 2025 19:32:20 +0000 Subject: [PATCH 1/6] improved zero-copy allreduce with nvls --- apps/nccl/src/allreduce.cu | 14 +++++++++++--- apps/nccl/src/allreduce.hpp | 9 +++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/apps/nccl/src/allreduce.cu b/apps/nccl/src/allreduce.cu index 257db2ccc..2cba2a7fe 100644 --- a/apps/nccl/src/allreduce.cu +++ b/apps/nccl/src/allreduce.cu @@ -79,7 +79,9 @@ struct NvlsAdapter { #endif { using ChannelType = mscclpp::DeviceHandle; - int nBlocks = nRanksPerNode; + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + int nBlocks = (prop.major == 10) ? 24 : nRanksPerNode; int nThreadsPerBlock = 1024; allreduce9<<>>((ChannelType*)memoryChannels, nvlsChannels, nvlsOutChannels, channelInOffset, channelOutOffset, @@ -331,7 +333,13 @@ mscclpp::Algorithm AllreducePacket::build() { void AllreduceNvls::initialize(std::shared_ptr comm, std::unordered_map>&) { - nSwitchChannels_ = 8; + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + if (prop.major == 10) { + nSwitchChannels_ = 24; + } else { + nSwitchChannels_ = 8; + } this->conns_ = setupConnections(comm); // setup semaphores std::vector> memorySemaphores = @@ -680,4 +688,4 @@ mscclpp::Algorithm AllreduceNvlsPacket::build() { return self->generateAllreduceContextKey(input, output, count, dtype); }); return allreduceAlgo; -} \ No newline at end of file +} diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp index 82adc323b..b5f292bf9 100644 --- a/apps/nccl/src/allreduce.hpp +++ b/apps/nccl/src/allreduce.hpp @@ -821,7 +821,11 @@ __global__ void __launch_bounds__(1024, 1) int nBlocks = gridDim.x; int bid = blockIdx.x; size_t sizePerRank = size / nRanksPerNode; +#if __CUDA_ARCH__ >=1000 + size_t sizePerBlock = (sizePerRank / nBlocks) / 16 * 16; +#else size_t sizePerBlock = sizePerRank / nBlocks; +#endif size_t rankOffset = sizePerRank * rank; size_t blockOffset = sizePerBlock * bid + rankOffset; mscclpp::DeviceHandle* multicastPtr = multicast + bid; @@ -842,6 +846,11 @@ __global__ void __launch_bounds__(1024, 1) __syncthreads(); T* src = (T*)multicastPtr->mcPtr; T* dst = (T*)multicastOutPtr->mcPtr; +#if __CUDA_ARCH__ >= 1000 + if (bid == nBlocks - 1) { + sizePerBlock = sizePerRank - sizePerBlock * (nBlocks - 1); + } +#endif handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, sizePerBlock, threadIdx.x, blockDim.x); __syncthreads(); From 72d8f60594319d4fca8f61f7cceefb38ea96b72c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 9 Dec 2025 20:02:48 +0000 Subject: [PATCH 2/6] clean up the code --- apps/nccl/src/allreduce.cu | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/apps/nccl/src/allreduce.cu b/apps/nccl/src/allreduce.cu index 2cba2a7fe..12f10a364 100644 --- a/apps/nccl/src/allreduce.cu +++ b/apps/nccl/src/allreduce.cu @@ -335,11 +335,7 @@ void AllreduceNvls::initialize(std::shared_ptr comm, std::unordered_map>&) { cudaDeviceProp prop; cudaGetDeviceProperties(&prop, 0); - if (prop.major == 10) { - nSwitchChannels_ = 24; - } else { - nSwitchChannels_ = 8; - } + nSwitchChannels_ = (prop.major == 10) ? 24 : 8; this->conns_ = setupConnections(comm); // setup semaphores std::vector> memorySemaphores = From ec622a17def2b9e19c0ef7fbc5ef4d94fa9c5fd1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 17 Dec 2025 20:41:17 +0000 Subject: [PATCH 3/6] add some checkings, fix indentation, and add some explanations --- apps/nccl/src/allreduce.cu | 6 ++++-- apps/nccl/src/allreduce.hpp | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/apps/nccl/src/allreduce.cu b/apps/nccl/src/allreduce.cu index 12f10a364..e8d30afc0 100644 --- a/apps/nccl/src/allreduce.cu +++ b/apps/nccl/src/allreduce.cu @@ -80,7 +80,8 @@ struct NvlsAdapter { { using ChannelType = mscclpp::DeviceHandle; cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, 0); + MSCCLPP_CUTHROW(cudaGetDeviceProperties(&prop, 0)); + // On GB200, the optimal number of blocks depends on the GPU issue rate + NVLink switch reduction capacity which is 24 here int nBlocks = (prop.major == 10) ? 24 : nRanksPerNode; int nThreadsPerBlock = 1024; allreduce9<<>>((ChannelType*)memoryChannels, nvlsChannels, @@ -334,7 +335,8 @@ mscclpp::Algorithm AllreducePacket::build() { void AllreduceNvls::initialize(std::shared_ptr comm, std::unordered_map>&) { cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, 0); + MSCCLPP_CUTHROW(cudaGetDeviceProperties(&prop, 0)); + // On GB200, the optimal number of blocks depends on the GPU issue rate + NVLink switch reduction capacity which is 24 here nSwitchChannels_ = (prop.major == 10) ? 24 : 8; this->conns_ = setupConnections(comm); // setup semaphores diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp index b5f292bf9..b9db69785 100644 --- a/apps/nccl/src/allreduce.hpp +++ b/apps/nccl/src/allreduce.hpp @@ -821,7 +821,7 @@ __global__ void __launch_bounds__(1024, 1) int nBlocks = gridDim.x; int bid = blockIdx.x; size_t sizePerRank = size / nRanksPerNode; -#if __CUDA_ARCH__ >=1000 +#if __CUDA_ARCH__ >= 1000 size_t sizePerBlock = (sizePerRank / nBlocks) / 16 * 16; #else size_t sizePerBlock = sizePerRank / nBlocks; From 95fac3d97cf934599fa93f1c8259f37e35a0b4cd Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 17 Dec 2025 20:53:57 +0000 Subject: [PATCH 4/6] comply with clang format --- apps/nccl/src/allreduce.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/nccl/src/allreduce.cu b/apps/nccl/src/allreduce.cu index e8d30afc0..a3cf0da23 100644 --- a/apps/nccl/src/allreduce.cu +++ b/apps/nccl/src/allreduce.cu @@ -81,7 +81,8 @@ struct NvlsAdapter { using ChannelType = mscclpp::DeviceHandle; cudaDeviceProp prop; MSCCLPP_CUTHROW(cudaGetDeviceProperties(&prop, 0)); - // On GB200, the optimal number of blocks depends on the GPU issue rate + NVLink switch reduction capacity which is 24 here + // On GB200, the optimal number of blocks depends on the GPU issue rate + + // NVLink switch reduction capacity, which is 24 here int nBlocks = (prop.major == 10) ? 24 : nRanksPerNode; int nThreadsPerBlock = 1024; allreduce9<<>>((ChannelType*)memoryChannels, nvlsChannels, @@ -336,7 +337,8 @@ void AllreduceNvls::initialize(std::shared_ptr comm, std::unordered_map>&) { cudaDeviceProp prop; MSCCLPP_CUTHROW(cudaGetDeviceProperties(&prop, 0)); - // On GB200, the optimal number of blocks depends on the GPU issue rate + NVLink switch reduction capacity which is 24 here + // On GB200, the optimal number of blocks depends on the GPU issue rate + + // NVLink switch reduction capacity, which is 24 here nSwitchChannels_ = (prop.major == 10) ? 24 : 8; this->conns_ = setupConnections(comm); // setup semaphores From 3592800b4b09821c694ec9ac8cabfe38067db602 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 17 Dec 2025 21:26:11 +0000 Subject: [PATCH 5/6] check cuda api ret value with MSCCLPP_CUDATHROW --- apps/nccl/src/allreduce.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/nccl/src/allreduce.cu b/apps/nccl/src/allreduce.cu index a3cf0da23..e1d160995 100644 --- a/apps/nccl/src/allreduce.cu +++ b/apps/nccl/src/allreduce.cu @@ -80,7 +80,7 @@ struct NvlsAdapter { { using ChannelType = mscclpp::DeviceHandle; cudaDeviceProp prop; - MSCCLPP_CUTHROW(cudaGetDeviceProperties(&prop, 0)); + MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&prop, 0)); // On GB200, the optimal number of blocks depends on the GPU issue rate + // NVLink switch reduction capacity, which is 24 here int nBlocks = (prop.major == 10) ? 24 : nRanksPerNode; @@ -336,7 +336,7 @@ mscclpp::Algorithm AllreducePacket::build() { void AllreduceNvls::initialize(std::shared_ptr comm, std::unordered_map>&) { cudaDeviceProp prop; - MSCCLPP_CUTHROW(cudaGetDeviceProperties(&prop, 0)); + MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&prop, 0)); // On GB200, the optimal number of blocks depends on the GPU issue rate + // NVLink switch reduction capacity, which is 24 here nSwitchChannels_ = (prop.major == 10) ? 24 : 8; From 112868539622b931e17cb4370711e1659b743804 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 17 Dec 2025 14:14:38 -0800 Subject: [PATCH 6/6] Add NCCL directory to CodeQL analysis paths --- .github/workflows/codeql-analysis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index b423e3265..562d9c38c 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -10,6 +10,7 @@ on: - 'src/**' - 'include/**' - 'CMakeLists.txt' + - 'apps/nccl/**' - '.github/workflows/codeql-analysis.yml' pull_request: branches: @@ -20,6 +21,7 @@ on: - 'src/**' - 'include/**' - 'CMakeLists.txt' + - 'apps/nccl/**' - '.github/workflows/codeql-analysis.yml' schedule: - cron: "30 1 * * 1"