From 3b0d5deab74b140e3cbcf8290a93f675afb265d9 Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Tue, 9 Dec 2025 19:32:20 +0000
Subject: [PATCH 1/6] improved zero-copy allreduce with nvls

---
 apps/nccl/src/allreduce.cu  | 14 +++++++++++---
 apps/nccl/src/allreduce.hpp |  9 +++++++++
 2 files changed, 20 insertions(+), 3 deletions(-)
diff --git a/apps/nccl/src/allreduce.cu b/apps/nccl/src/allreduce.cu
index 257db2ccc..2cba2a7fe 100644
--- a/apps/nccl/src/allreduce.cu
+++ b/apps/nccl/src/allreduce.cu
@@ -79,7 +79,9 @@ struct NvlsAdapter {
 #endif
     {
       using ChannelType = mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>;
-      int nBlocks = nRanksPerNode;
+      cudaDeviceProp prop;
+      cudaGetDeviceProperties(&prop, 0);
+      int nBlocks = (prop.major == 10) ? 24 : nRanksPerNode;
       int nThreadsPerBlock = 1024;
       allreduce9<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>((ChannelType*)memoryChannels, nvlsChannels,
                                                               nvlsOutChannels, channelInOffset, channelOutOffset,
@@ -331,7 +333,13 @@ mscclpp::Algorithm AllreducePacket::build() {
 
 void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                                std::unordered_map<std::string, std::shared_ptr<void>>&) {
-  nSwitchChannels_ = 8;
+  cudaDeviceProp prop;
+  cudaGetDeviceProperties(&prop, 0);
+  if (prop.major == 10) {
+    nSwitchChannels_ = 24;
+  } else {
+    nSwitchChannels_ = 8;
+  }
   this->conns_ = setupConnections(comm);
   // setup semaphores
   std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores =
@@ -680,4 +688,4 @@ mscclpp::Algorithm AllreduceNvlsPacket::build() {
         return self->generateAllreduceContextKey(input, output, count, dtype);
       });
   return allreduceAlgo;
-}
\ No newline at end of file
+}
diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 82adc323b..b5f292bf9 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -821,7 +821,11 @@ __global__ void __launch_bounds__(1024, 1)
   int nBlocks = gridDim.x;
   int bid = blockIdx.x;
   size_t sizePerRank = size / nRanksPerNode;
+#if __CUDA_ARCH__ >=1000
+  size_t sizePerBlock = (sizePerRank / nBlocks) / 16 * 16;
+#else
   size_t sizePerBlock = sizePerRank / nBlocks;
+#endif
   size_t rankOffset = sizePerRank * rank;
   size_t blockOffset = sizePerBlock * bid + rankOffset;
   mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicastPtr = multicast + bid;
@@ -842,6 +846,11 @@ __global__ void __launch_bounds__(1024, 1)
   __syncthreads();
   T* src = (T*)multicastPtr->mcPtr;
   T* dst = (T*)multicastOutPtr->mcPtr;
+#if __CUDA_ARCH__ >= 1000
+  if (bid == nBlocks - 1) {
+    sizePerBlock = sizePerRank - sizePerBlock * (nBlocks - 1);
+  }
+#endif
   handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, sizePerBlock,
                              threadIdx.x, blockDim.x);
   __syncthreads();

From 72d8f60594319d4fca8f61f7cceefb38ea96b72c Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Tue, 9 Dec 2025 20:02:48 +0000
Subject: [PATCH 2/6] clean up the code

---
 apps/nccl/src/allreduce.cu | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/apps/nccl/src/allreduce.cu b/apps/nccl/src/allreduce.cu
index 2cba2a7fe..12f10a364 100644
--- a/apps/nccl/src/allreduce.cu
+++ b/apps/nccl/src/allreduce.cu
@@ -335,11 +335,7 @@ void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                                std::unordered_map<std::string, std::shared_ptr<void>>&) {
   cudaDeviceProp prop;
   cudaGetDeviceProperties(&prop, 0);
-  if (prop.major == 10) {
-    nSwitchChannels_ = 24;
-  } else {
-    nSwitchChannels_ = 8;
-  }
+  nSwitchChannels_ = (prop.major == 10) ? 24 : 8;
   this->conns_ = setupConnections(comm);
   // setup semaphores
   std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores =

From ec622a17def2b9e19c0ef7fbc5ef4d94fa9c5fd1 Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdieghazi@microsoft.com>
Date: Wed, 17 Dec 2025 20:41:17 +0000
Subject: [PATCH 3/6] add some checkings, fix indentation, and add some
 explanations

---
 apps/nccl/src/allreduce.cu  | 6 ++++--
 apps/nccl/src/allreduce.hpp | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/apps/nccl/src/allreduce.cu b/apps/nccl/src/allreduce.cu
index 12f10a364..e8d30afc0 100644
--- a/apps/nccl/src/allreduce.cu
+++ b/apps/nccl/src/allreduce.cu
@@ -80,7 +80,8 @@ struct NvlsAdapter {
     {
       using ChannelType = mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>;
       cudaDeviceProp prop;
-      cudaGetDeviceProperties(&prop, 0);
+      MSCCLPP_CUTHROW(cudaGetDeviceProperties(&prop, 0));
+      // On GB200, the optimal number of blocks depends on the GPU issue rate + NVLink switch reduction capacity which is 24 here
       int nBlocks = (prop.major == 10) ? 24 : nRanksPerNode;
       int nThreadsPerBlock = 1024;
       allreduce9<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>((ChannelType*)memoryChannels, nvlsChannels,
@@ -334,7 +335,8 @@ mscclpp::Algorithm AllreducePacket::build() {
 void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                                std::unordered_map<std::string, std::shared_ptr<void>>&) {
   cudaDeviceProp prop;
-  cudaGetDeviceProperties(&prop, 0);
+  MSCCLPP_CUTHROW(cudaGetDeviceProperties(&prop, 0));
+  // On GB200, the optimal number of blocks depends on the GPU issue rate + NVLink switch reduction capacity which is 24 here
   nSwitchChannels_ = (prop.major == 10) ? 24 : 8;
   this->conns_ = setupConnections(comm);
   // setup semaphores
diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index b5f292bf9..b9db69785 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -821,7 +821,7 @@ __global__ void __launch_bounds__(1024, 1)
   int nBlocks = gridDim.x;
   int bid = blockIdx.x;
   size_t sizePerRank = size / nRanksPerNode;
-#if __CUDA_ARCH__ >=1000
+#if __CUDA_ARCH__ >= 1000
   size_t sizePerBlock = (sizePerRank / nBlocks) / 16 * 16;
 #else
   size_t sizePerBlock = sizePerRank / nBlocks;

From 95fac3d97cf934599fa93f1c8259f37e35a0b4cd Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdieghazi@microsoft.com>
Date: Wed, 17 Dec 2025 20:53:57 +0000
Subject: [PATCH 4/6] comply with clang format

---
 apps/nccl/src/allreduce.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/apps/nccl/src/allreduce.cu b/apps/nccl/src/allreduce.cu
index e8d30afc0..a3cf0da23 100644
--- a/apps/nccl/src/allreduce.cu
+++ b/apps/nccl/src/allreduce.cu
@@ -81,7 +81,8 @@ struct NvlsAdapter {
       using ChannelType = mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>;
       cudaDeviceProp prop;
       MSCCLPP_CUTHROW(cudaGetDeviceProperties(&prop, 0));
-      // On GB200, the optimal number of blocks depends on the GPU issue rate + NVLink switch reduction capacity which is 24 here
+      // On GB200, the optimal number of blocks depends on the GPU issue rate +
+      // NVLink switch reduction capacity, which is 24 here
       int nBlocks = (prop.major == 10) ? 24 : nRanksPerNode;
       int nThreadsPerBlock = 1024;
       allreduce9<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>((ChannelType*)memoryChannels, nvlsChannels,
@@ -336,7 +337,8 @@ void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                                std::unordered_map<std::string, std::shared_ptr<void>>&) {
   cudaDeviceProp prop;
   MSCCLPP_CUTHROW(cudaGetDeviceProperties(&prop, 0));
-  // On GB200, the optimal number of blocks depends on the GPU issue rate + NVLink switch reduction capacity which is 24 here
+  // On GB200, the optimal number of blocks depends on the GPU issue rate +
+  // NVLink switch reduction capacity, which is 24 here
   nSwitchChannels_ = (prop.major == 10) ? 24 : 8;
   this->conns_ = setupConnections(comm);
   // setup semaphores

From 3592800b4b09821c694ec9ac8cabfe38067db602 Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdieghazi@microsoft.com>
Date: Wed, 17 Dec 2025 21:26:11 +0000
Subject: [PATCH 5/6] check cuda api ret value with MSCCLPP_CUDATHROW

---
 apps/nccl/src/allreduce.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/nccl/src/allreduce.cu b/apps/nccl/src/allreduce.cu
index a3cf0da23..e1d160995 100644
--- a/apps/nccl/src/allreduce.cu
+++ b/apps/nccl/src/allreduce.cu
@@ -80,7 +80,7 @@ struct NvlsAdapter {
     {
       using ChannelType = mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>;
       cudaDeviceProp prop;
-      MSCCLPP_CUTHROW(cudaGetDeviceProperties(&prop, 0));
+      MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&prop, 0));
       // On GB200, the optimal number of blocks depends on the GPU issue rate +
       // NVLink switch reduction capacity, which is 24 here
       int nBlocks = (prop.major == 10) ? 24 : nRanksPerNode;
@@ -336,7 +336,7 @@ mscclpp::Algorithm AllreducePacket::build() {
 void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                                std::unordered_map<std::string, std::shared_ptr<void>>&) {
   cudaDeviceProp prop;
-  MSCCLPP_CUTHROW(cudaGetDeviceProperties(&prop, 0));
+  MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&prop, 0));
   // On GB200, the optimal number of blocks depends on the GPU issue rate +
   // NVLink switch reduction capacity, which is 24 here
   nSwitchChannels_ = (prop.major == 10) ? 24 : 8;

From 112868539622b931e17cb4370711e1659b743804 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 17 Dec 2025 14:14:38 -0800
Subject: [PATCH 6/6] Add NCCL directory to CodeQL analysis paths

---
 .github/workflows/codeql-analysis.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index b423e3265..562d9c38c 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -10,6 +10,7 @@ on:
       - 'src/**'
       - 'include/**'
       - 'CMakeLists.txt'
+      - 'apps/nccl/**'
       - '.github/workflows/codeql-analysis.yml'
   pull_request:
     branches:
@@ -20,6 +21,7 @@ on:
       - 'src/**'
       - 'include/**'
       - 'CMakeLists.txt'
+      - 'apps/nccl/**'
       - '.github/workflows/codeql-analysis.yml'
   schedule:
     - cron: "30 1 * * 1"