NVIDIA · charan-003 · May 21, 2026 · May 22, 2026 · May 22, 2026 · May 22, 2026
@@ -32,6 +32,8 @@ benchmarks:
       # Examples:
       # - '^cub\.bench\.for_each\.base'
       # - '^cub\.bench\.reduce\.(sum|min)\.'
+      - '^cub\.bench\.reduce\.block_reduce_warp_reductions_sum\.'
+      - '^cub\.bench\.reduce\.warp_reduce_(sum|min)\.'
 
     # Python benchmark filters (regex matched against paths under benchmarks/).
     python:
@@ -44,11 +46,11 @@ benchmarks:
   gpus:
     # - "t4"         # sm_75, 16 GB
     # - "rtx2080"    # sm_75,  8 GB
-    # - "rtxa6000"   # sm_86, 48 GB
+    - "rtxa6000"   # sm_86, 48 GB
     # - "l4"         # sm_89, 24 GB
     # - "rtx4090"    # sm_89, 24 GB
-    # - "h100"       # sm_90, 80 GB
-    # - "rtxpro6000" # sm_120
+    - "h100"       # sm_90, 80 GB
+    - "rtxpro6000" # sm_120
 
   # Extra .devcontainer/launch.sh -d args
   # launch_args: "--cuda 13.2 --host gcc14"

@@ -0,0 +1,58 @@
+// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#include <cub/block/block_reduce.cuh>
+
+#include <cuda_runtime_api.h>
+#include <device_side_benchmark.cuh>
+#include <nvbench_helper.cuh>
+
+template <int BlockThreads>
+struct benchmark_op_t
+{
+  template <typename T>
+  __device__ __forceinline__ T operator()(T thread_data) const
+  {
+    using BlockReduce = cub::BlockReduce<T, BlockThreads, cub::BLOCK_REDUCE_WARP_REDUCTIONS>;
+    using TempStorage = typename BlockReduce::TempStorage;
+    __shared__ TempStorage temp_storage;
+    T agg = BlockReduce{temp_storage}.Reduce(thread_data, op_t{});
+    // Re-broadcast so every thread depends on the reduction result, preventing DCE.
+    __shared__ T broadcast;
+    if (threadIdx.x == 0)
+    {
+      broadcast = agg;
+    }
+    __syncthreads();
+    return broadcast;
+  }
+};
+
+template <typename T>
+void block_reduce_warp_reductions(nvbench::state& state, nvbench::type_list<T>)
+{
+  constexpr int block_size    = 256; // 8 warps -> exercises optimized ApplyWarpAggregates
+  constexpr int unroll_factor = 32; // compromise between compile time and noise
+  using action_t              = benchmark_op_t<block_size>;
+  const auto& kernel          = benchmark_kernel<block_size, unroll_factor, action_t, T>;
+  const int num_SMs           = state.get_device().value().get_number_of_sms();
+  int max_blocks_per_SM       = 0;
+  NVBENCH_CUDA_CALL_NOEXCEPT(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_SM, kernel, block_size, 0));
+  const int grid_size = max_blocks_per_SM * num_SMs;
+  if (grid_size == 0)
+  {
+    state.skip("Kernel occupancy is zero for this type/configuration.");
+    return;
+  }
+  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch&) {
+    kernel<<<grid_size, block_size>>>(action_t{});
+  });
+}
+
+NVBENCH_BENCH_TYPES(block_reduce_warp_reductions, NVBENCH_TYPE_AXES(value_types))
+  .set_name("base")
+  .set_type_axes_names({"T{ct}"});
@@ -0,0 +1,35 @@
+// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3
+
+#include <nvbench_helper.cuh>
+
+using value_types = nvbench::type_list<
+  int8_t,
+  int16_t,
+  int32_t,
+  int64_t,
+#if _CCCL_HAS_INT128()
+  int128_t,
+#endif
+#if _CCCL_HAS_NVFP16() && _CCCL_CTK_AT_LEAST(12, 2)
+  __half,
+#endif
+#if _CCCL_HAS_NVBF16() && _CCCL_CTK_AT_LEAST(12, 2)
+  __nv_bfloat16,
+#endif
+  float,
+  double,
+#if _CCCL_HAS_FLOAT128()
+  __float128,
+#endif
+#if _CCCL_HAS_NVFP16() && _CCCL_CTK_AT_LEAST(12, 2)
+  cuda::std::complex<__half>,
+#endif
+#if _CCCL_HAS_NVBF16() && _CCCL_CTK_AT_LEAST(12, 2)
+  cuda::std::complex<__nv_bfloat16>,
+#endif
+  cuda::std::complex<float>,
+  cuda::std::complex<double>>;
+
+using op_t = ::cuda::std::plus<>;
+#include "block_reduce_warp_reductions_base.cuh"
@@ -33,6 +33,11 @@ void warp_reduce(nvbench::state& state, nvbench::type_list<T>)
   int max_blocks_per_SM       = 0;
   NVBENCH_CUDA_CALL_NOEXCEPT(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_SM, kernel, block_size, 0));
   const int grid_size = max_blocks_per_SM * num_SMs;
+  if (grid_size == 0)
+  {
+    state.skip("Kernel occupancy is zero for this type/configuration.");
+    return;
+  }
   state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch&) {
     kernel<<<grid_size, block_size>>>(benchmark_op_t{});
   });

@@ -21,13 +21,18 @@
 #endif // no system header
 
 #include <cub/detail/uninitialized_copy.cuh>
+#include <cub/thread/thread_operators.cuh>
 #include <cub/util_ptx.cuh>
 #include <cub/warp/warp_reduce.cuh>
 
 #include <cuda/__cmath/ceil_div.h>
+#include <cuda/__cmath/pow2.h>
+#include <cuda/__functional/operator_properties.h>
 #include <cuda/__ptx/instructions/get_sreg.h>
-#include <cuda/atomic>
 #include <cuda/std/__algorithm/min.h>
+#include <cuda/std/__bit/integral.h>
+
+#include <nv/target>
 
 CUB_NAMESPACE_BEGIN
 namespace detail
@@ -97,46 +102,10 @@ struct BlockReduceWarpReductions
   {}
 
   //! @rst
-  //! Returns block-wide aggregate in *thread*\ :sub:`0`.
-  //! @endrst
-  //!
-  //! @tparam ReductionOp
-  //!   **[inferred]** Binary reduction operator type
-  //!
-  //! @param[in] reduction_op
-  //!   Binary reduction operator
+  //! Cooperatively reduces warp aggregates.
   //!
-  //! @param[in] warp_aggregate
-  //!   **[**\ *lane*\ :sub:`0` **only]** Warp-wide aggregate reduction of input items
-  template <typename ReductionOp>
-  _CCCL_DEVICE _CCCL_FORCEINLINE T ApplyWarpAggregatesNonDeterministic(ReductionOp reduction_op, T warp_aggregate)
-  {
-    if (linear_tid == 0)
-    {
-      detail::uninitialized_copy_single(temp_storage.warp_aggregates, warp_aggregate);
-    }
-
-    __syncthreads();
-
-    // Warp 0 already contributed its aggregate above since its also linear_tid == 0
-    if (lane_id == 0 && warp_id != 0)
-    {
-      // TODO: replace this with other atomic operations when specified
-      NV_IF_ELSE_TARGET(
-        NV_PROVIDES_SM_60,
-        ({
-          ::cuda::atomic_ref<T, ::cuda::thread_scope_block> atomic_target(temp_storage.warp_aggregates[0]);
-          atomic_target.fetch_add(warp_aggregate, ::cuda::memory_order_relaxed);
-        }),
-        (atomicAdd(&temp_storage.warp_aggregates[0], warp_aggregate);));
-    }
-
-    __syncthreads();
-    return temp_storage.warp_aggregates[0];
-  }
-
-  //! @rst
-  //! Recursively applies warp aggregates using template unrolling for deterministic reduction.
+  //! For small blocks (few warps) the reduction is performed sequentially by ``linear_tid == 0``.
+  //! For larger blocks, warp 0 reduces the warp aggregates in parallel:
   //! @endrst
   //!
   //! @tparam FullTile
@@ -155,21 +124,69 @@ struct BlockReduceWarpReductions
 
     __syncthreads();
 
-    // Update total aggregate in warp 0, lane 0
-    if (linear_tid == 0)
+    // Below this number of warps the parallel warp-0 reduction is not worthwhile compared to a
+    // single-thread sequential loop over the warp aggregates.
+    constexpr int small_block_warp_threshold = 8;
+
+    if constexpr (warps < small_block_warp_threshold)
     {
-      _CCCL_PRAGMA_UNROLL_FULL()
-      for (int warp_idx = 1; warp_idx < warps; ++warp_idx)
+      // Sequential reduction in linear_tid == 0 (legacy path for small blocks).
+      if (linear_tid == 0)
       {
-        if (FullTile || (warp_idx * logical_warp_size < num_valid))
+        _CCCL_PRAGMA_UNROLL_FULL()
+        for (int warp_idx = 1; warp_idx < warps; ++warp_idx)
         {
-          T addend       = temp_storage.warp_aggregates[warp_idx];
-          warp_aggregate = reduction_op(warp_aggregate, addend);
+          if (FullTile || (warp_idx * logical_warp_size < num_valid))
+          {
+            warp_aggregate = reduction_op(warp_aggregate, temp_storage.warp_aggregates[warp_idx]);
+          }
         }
       }
+      return warp_aggregate;
     }
+    else
+    {
+      // Parallel reduction in warp 0.
+      if (warp_id == 0)
+      {
+        const int num_warps = FullTile ? int(warps) : static_cast<int>(::cuda::ceil_div(num_valid, logical_warp_size));
+
+        if constexpr (is_redux_enabled_cuda_operator<ReductionOp, T>)
+        {
+          static_assert(::cuda::has_identity_element_v<ReductionOp, T>,
+                        "REDUX-eligible operators must have an identity element");
+          NV_IF_TARGET(NV_PROVIDES_SM_80, ({
+                         const T id  = ::cuda::identity_element<ReductionOp, T>();
+                         const T val = (lane_id < num_warps) ? temp_storage.warp_aggregates[lane_id] : id;
+                         return reduce_op_sync(val, 0xFFFFFFFFu, reduction_op);
+                       }))
+        }
+
+        // Shuffle-based tree over bit_ceil(warps) lanes.
+        constexpr int logical_lanes = static_cast<int>(::cuda::std::bit_ceil(static_cast<unsigned>(warps)));
 
-    return warp_aggregate;
+        T val;
+        constexpr bool has_identity = ::cuda::has_identity_element_v<ReductionOp, T>;
+        if constexpr (has_identity)
+        {
+          const T id = ::cuda::identity_element<ReductionOp, T>();
+          val        = (lane_id < num_warps) ? temp_storage.warp_aggregates[lane_id] : id;
+        }
+        else
+        {
+          val = (lane_id < num_warps) ? temp_storage.warp_aggregates[lane_id] : T{};
+        }
+
+        // When we have an identity element, every lane in the logical warp holds a valid value
+        // (real or identity), so we can take the all-lanes-valid fast path. Otherwise, fall back
+        // to the partial-valid form which uses num_warps as the last lane.
+        constexpr bool all_lanes_valid = has_identity || (FullTile && (warps == logical_lanes));
+        NullType dummy_storage;
+        warp_aggregate =
+          WarpReduceShfl<T, logical_lanes>(dummy_storage).template Reduce<all_lanes_valid>(val, num_warps, reduction_op);
+      }
+      return warp_aggregate;
+    }
   }
 
   //! @rst
@@ -201,14 +218,7 @@ struct BlockReduceWarpReductions
                          .template Reduce<(FullTile && even_warp_multiple)>(input, warp_num_valid, reduction_op);
 
     // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
-    if constexpr (IsDeterministic)
-    {
-      return ApplyWarpAggregates<FullTile>(reduction_op, warp_aggregate, num_valid);
-    }
-    else
-    {
-      return ApplyWarpAggregatesNonDeterministic(reduction_op, warp_aggregate);
-    }
+    return ApplyWarpAggregates<FullTile>(reduction_op, warp_aggregate, num_valid);
   }
 
   //! @rst
@@ -245,14 +255,7 @@ struct BlockReduceWarpReductions
                                .template Reduce<(FullTile && even_warp_multiple)>(input, warp_num_valid, reduction_op);
 
     // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
-    if constexpr (IsDeterministic)
-    {
-      return ApplyWarpAggregates<FullTile>(reduction_op, warp_aggregate, num_valid);
-    }
-    else
-    {
-      return ApplyWarpAggregatesNonDeterministic(reduction_op, warp_aggregate);
-    }
+    return ApplyWarpAggregates<FullTile>(reduction_op, warp_aggregate, num_valid);
   }
 };
 } // namespace detail