Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions ci/bench.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ benchmarks:
# Examples:
# - '^cub\.bench\.for_each\.base'
# - '^cub\.bench\.reduce\.(sum|min)\.'
- '^cub\.bench\.reduce\.block_reduce_warp_reductions_sum\.'
- '^cub\.bench\.reduce\.warp_reduce_(sum|min)\.'
Comment thread
coderabbitai[bot] marked this conversation as resolved.

# Python benchmark filters (regex matched against paths under benchmarks/).
python:
Expand All @@ -44,11 +46,11 @@ benchmarks:
gpus:
# - "t4" # sm_75, 16 GB
# - "rtx2080" # sm_75, 8 GB
# - "rtxa6000" # sm_86, 48 GB
- "rtxa6000" # sm_86, 48 GB
# - "l4" # sm_89, 24 GB
# - "rtx4090" # sm_89, 24 GB
# - "h100" # sm_90, 80 GB
# - "rtxpro6000" # sm_120
- "h100" # sm_90, 80 GB
- "rtxpro6000" # sm_120

# Extra .devcontainer/launch.sh -d args
# launch_args: "--cuda 13.2 --host gcc14"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3

#pragma once

#include <cub/config.cuh>

#include <cub/block/block_reduce.cuh>

#include <cuda_runtime_api.h>
#include <device_side_benchmark.cuh>
#include <nvbench_helper.cuh>

template <int BlockThreads>
struct benchmark_op_t
{
template <typename T>
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion | 🟠 Major | ⚡ Quick win

suggestion: Line 17 and Line 35 use a single-letter template parameter (T). Use _Tp-style naming here to satisfy the template-parameter naming rule for this codebase.

As per coding guidelines "Avoid single letter names for template parameters; use _Tp instead of _T".

Also applies to: 35-35

__device__ __forceinline__ T operator()(T thread_data) const
{
Comment on lines +18 to +19
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Read-only verification: confirm missing CCCL API annotations / noexcept in this file
rg -nP '^\s*(template\s*<[^>]+>\s*)?(?!.*_CCCL_(HOST_API|DEVICE_API|API)).*\b(operator\(\)|block_reduce_warp_reductions)\b' cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh
rg -nP '\b(operator\(\)|block_reduce_warp_reductions)\b(?!.*noexcept)' cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh

Repository: NVIDIA/cccl

Length of output: 391


🏁 Script executed:

#!/bin/bash
set -euo pipefail

sed -n '1,120p' cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh | nl -ba | sed -n '10,80p'

Repository: NVIDIA/cccl

Length of output: 98


🏁 Script executed:

#!/bin/bash
set -euo pipefail

rg -n "operator\(\)|block_reduce_warp_reductions" cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh

Repository: NVIDIA/cccl

Length of output: 289


🏁 Script executed:

#!/bin/bash
set -euo pipefail

# Capture the full function signatures/headers for the two targets (best-effort)
perl -0777 -ne '
my $f=$_; 
if($f =~ /(operator\(\)\s*const[^\n]*\n\s*\{)/s){ print "FOUND operator():\n$1\n"; }
if($f =~ /(block_reduce_warp_reductions[^\n]*\n\s*\{)/s){ print "FOUND block_reduce_warp_reductions():\n$1\n"; }
' cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh

Repository: NVIDIA/cccl

Length of output: 170


🏁 Script executed:

#!/bin/bash
set -euo pipefail

# Show the relevant top portion with line numbers (no nl dependency)
awk '{printf "%5d:%s\n", NR, $0}' cub/benchmarks/bench/reduce/block_reduce_warp_warp_reductions_base.cuh | sed -n '1,110p'

Repository: NVIDIA/cccl

Length of output: 196


🏁 Script executed:

#!/bin/bash
set -euo pipefail

# Check for noexcept / CCCL API tokens near the declarations
rg -n "operator\(\)|block_reduce_warp_reductions" -n cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh
rg -n --context=3 "operator\(\)|block_reduce_warp_reductions" cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh
rg -n --context=0 "_CCCL_(HOST_API|DEVICE_API|API)" cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh

Repository: NVIDIA/cccl

Length of output: 1049


🏁 Script executed:

#!/bin/bash
set -euo pipefail

# Look for exception-throwing in the relevant functions/bodies
rg -n "\bthrow\b" cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh

Repository: NVIDIA/cccl

Length of output: 37


🏁 Script executed:

#!/bin/bash
set -euo pipefail

FILE="cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh"
# Print a small, line-numbered window around operator() and block_reduce_warp_reductions
sed -n '10,60p' "$FILE" | awk '{printf "%5d:%s\n", NR+10-1, $0}'

Repository: NVIDIA/cccl

Length of output: 2048


🏁 Script executed:

#!/bin/bash
set -euo pipefail

FILE="cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh"
# Check for _CCCL_* usage and noexcept in/near the relevant symbols
rg -n "_CCCL_(HOST_API|DEVICE_API|API)" "$FILE"
rg -n "noexcept" "$FILE"
rg -n --context=2 "operator\(\)" "$FILE"
rg -n --context=2 "block_reduce_warp_reductions" "$FILE"

Repository: NVIDIA/cccl

Length of output: 37


🏁 Script executed:

#!/bin/bash
set -euo pipefail

FILE="cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh"
# Look for exception throwing inside this file
rg -n "\bthrow\b" "$FILE"

Repository: NVIDIA/cccl

Length of output: 37


🏁 Script executed:

#!/bin/bash
set -euo pipefail
FILE="cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh"

echo "== _CCCL_* tokens =="
rg -n "_CCCL_(HOST_API|DEVICE_API|API)" "$FILE" || true

echo "== noexcept tokens =="
rg -n "noexcept" "$FILE" || true

echo "== throw tokens =="
rg -n "\bthrow\b" "$FILE" || true

Repository: NVIDIA/cccl

Length of output: 117


important: Add required _CCCL_* API annotations and noexcept to benchmark functions.

  • benchmark_op_t::operator() (line 18) lacks _CCCL_DEVICE_API and noexcept.
  • block_reduce_warp_reductions (line 36) lacks _CCCL_HOST_API and noexcept.
Suggested change
-  __device__ __forceinline__ T operator()(T thread_data) const
+  _CCCL_DEVICE_API __forceinline__ auto operator()(_Tp thread_data) const noexcept -> _Tp
-template <typename T>
-void block_reduce_warp_reductions(nvbench::state& state, nvbench::type_list<T>)
+template <typename _Tp>
+_CCCL_HOST_API void block_reduce_warp_reductions(nvbench::state& state, nvbench::type_list<_Tp>) noexcept

using BlockReduce = cub::BlockReduce<T, BlockThreads, cub::BLOCK_REDUCE_WARP_REDUCTIONS>;
using TempStorage = typename BlockReduce::TempStorage;
__shared__ TempStorage temp_storage;
T agg = BlockReduce{temp_storage}.Reduce(thread_data, op_t{});
// Re-broadcast so every thread depends on the reduction result, preventing DCE.
__shared__ T broadcast;
if (threadIdx.x == 0)
{
broadcast = agg;
}
__syncthreads();
return broadcast;
}
};

template <typename T>
void block_reduce_warp_reductions(nvbench::state& state, nvbench::type_list<T>)
{
constexpr int block_size = 256; // 8 warps -> exercises optimized ApplyWarpAggregates
constexpr int unroll_factor = 32; // compromise between compile time and noise
using action_t = benchmark_op_t<block_size>;
const auto& kernel = benchmark_kernel<block_size, unroll_factor, action_t, T>;
const int num_SMs = state.get_device().value().get_number_of_sms();
int max_blocks_per_SM = 0;
NVBENCH_CUDA_CALL_NOEXCEPT(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_SM, kernel, block_size, 0));
const int grid_size = max_blocks_per_SM * num_SMs;
if (grid_size == 0)
{
state.skip("Kernel occupancy is zero for this type/configuration.");
return;
}
state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch&) {
kernel<<<grid_size, block_size>>>(action_t{});
});
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}

NVBENCH_BENCH_TYPES(block_reduce_warp_reductions, NVBENCH_TYPE_AXES(value_types))
.set_name("base")
.set_type_axes_names({"T{ct}"});
35 changes: 35 additions & 0 deletions cub/benchmarks/bench/reduce/block_reduce_warp_reductions_sum.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3

#include <nvbench_helper.cuh>

using value_types = nvbench::type_list<
int8_t,
int16_t,
int32_t,
int64_t,
#if _CCCL_HAS_INT128()
int128_t,
#endif
#if _CCCL_HAS_NVFP16() && _CCCL_CTK_AT_LEAST(12, 2)
__half,
#endif
#if _CCCL_HAS_NVBF16() && _CCCL_CTK_AT_LEAST(12, 2)
__nv_bfloat16,
#endif
float,
double,
#if _CCCL_HAS_FLOAT128()
__float128,
#endif
#if _CCCL_HAS_NVFP16() && _CCCL_CTK_AT_LEAST(12, 2)
cuda::std::complex<__half>,
#endif
#if _CCCL_HAS_NVBF16() && _CCCL_CTK_AT_LEAST(12, 2)
cuda::std::complex<__nv_bfloat16>,
#endif
cuda::std::complex<float>,
cuda::std::complex<double>>;

using op_t = ::cuda::std::plus<>;
#include "block_reduce_warp_reductions_base.cuh"
5 changes: 5 additions & 0 deletions cub/benchmarks/bench/reduce/warp_reduce_base.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ void warp_reduce(nvbench::state& state, nvbench::type_list<T>)
int max_blocks_per_SM = 0;
NVBENCH_CUDA_CALL_NOEXCEPT(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_SM, kernel, block_size, 0));
const int grid_size = max_blocks_per_SM * num_SMs;
if (grid_size == 0)
{
state.skip("Kernel occupancy is zero for this type/configuration.");
return;
}
state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch&) {
kernel<<<grid_size, block_size>>>(benchmark_op_t{});
});
Expand Down
131 changes: 67 additions & 64 deletions cub/cub/block/specializations/block_reduce_warp_reductions.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,18 @@
#endif // no system header

#include <cub/detail/uninitialized_copy.cuh>
#include <cub/thread/thread_operators.cuh>
#include <cub/util_ptx.cuh>
#include <cub/warp/warp_reduce.cuh>

#include <cuda/__cmath/ceil_div.h>
#include <cuda/__cmath/pow2.h>
#include <cuda/__functional/operator_properties.h>
#include <cuda/__ptx/instructions/get_sreg.h>
#include <cuda/atomic>
#include <cuda/std/__algorithm/min.h>
#include <cuda/std/__bit/integral.h>

#include <nv/target>

CUB_NAMESPACE_BEGIN
namespace detail
Expand Down Expand Up @@ -97,46 +102,10 @@ struct BlockReduceWarpReductions
{}

//! @rst
//! Returns block-wide aggregate in *thread*\ :sub:`0`.
//! @endrst
//!
//! @tparam ReductionOp
//! **[inferred]** Binary reduction operator type
//!
//! @param[in] reduction_op
//! Binary reduction operator
//! Cooperatively reduces warp aggregates.
//!
//! @param[in] warp_aggregate
//! **[**\ *lane*\ :sub:`0` **only]** Warp-wide aggregate reduction of input items
template <typename ReductionOp>
_CCCL_DEVICE _CCCL_FORCEINLINE T ApplyWarpAggregatesNonDeterministic(ReductionOp reduction_op, T warp_aggregate)
{
if (linear_tid == 0)
{
detail::uninitialized_copy_single(temp_storage.warp_aggregates, warp_aggregate);
}

__syncthreads();

// Warp 0 already contributed its aggregate above since its also linear_tid == 0
if (lane_id == 0 && warp_id != 0)
{
// TODO: replace this with other atomic operations when specified
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_60,
({
::cuda::atomic_ref<T, ::cuda::thread_scope_block> atomic_target(temp_storage.warp_aggregates[0]);
atomic_target.fetch_add(warp_aggregate, ::cuda::memory_order_relaxed);
}),
(atomicAdd(&temp_storage.warp_aggregates[0], warp_aggregate);));
}

__syncthreads();
return temp_storage.warp_aggregates[0];
}

//! @rst
//! Recursively applies warp aggregates using template unrolling for deterministic reduction.
//! For small blocks (few warps) the reduction is performed sequentially by ``linear_tid == 0``.
//! For larger blocks, warp 0 reduces the warp aggregates in parallel:
//! @endrst
//!
//! @tparam FullTile
Expand All @@ -155,21 +124,69 @@ struct BlockReduceWarpReductions

__syncthreads();

// Update total aggregate in warp 0, lane 0
if (linear_tid == 0)
// Below this number of warps the parallel warp-0 reduction is not worthwhile compared to a
// single-thread sequential loop over the warp aggregates.
constexpr int small_block_warp_threshold = 8;

if constexpr (warps < small_block_warp_threshold)
{
_CCCL_PRAGMA_UNROLL_FULL()
for (int warp_idx = 1; warp_idx < warps; ++warp_idx)
// Sequential reduction in linear_tid == 0 (legacy path for small blocks).
if (linear_tid == 0)
{
if (FullTile || (warp_idx * logical_warp_size < num_valid))
_CCCL_PRAGMA_UNROLL_FULL()
for (int warp_idx = 1; warp_idx < warps; ++warp_idx)
{
T addend = temp_storage.warp_aggregates[warp_idx];
warp_aggregate = reduction_op(warp_aggregate, addend);
if (FullTile || (warp_idx * logical_warp_size < num_valid))
{
warp_aggregate = reduction_op(warp_aggregate, temp_storage.warp_aggregates[warp_idx]);
}
}
}
return warp_aggregate;
}
else
{
// Parallel reduction in warp 0.
if (warp_id == 0)
{
const int num_warps = FullTile ? int(warps) : static_cast<int>(::cuda::ceil_div(num_valid, logical_warp_size));

if constexpr (is_redux_enabled_cuda_operator<ReductionOp, T>)
{
static_assert(::cuda::has_identity_element_v<ReductionOp, T>,
"REDUX-eligible operators must have an identity element");
NV_IF_TARGET(NV_PROVIDES_SM_80, ({
const T id = ::cuda::identity_element<ReductionOp, T>();
const T val = (lane_id < num_warps) ? temp_storage.warp_aggregates[lane_id] : id;
return reduce_op_sync(val, 0xFFFFFFFFu, reduction_op);
}))
}

// Shuffle-based tree over bit_ceil(warps) lanes.
constexpr int logical_lanes = static_cast<int>(::cuda::std::bit_ceil(static_cast<unsigned>(warps)));

return warp_aggregate;
T val;
constexpr bool has_identity = ::cuda::has_identity_element_v<ReductionOp, T>;
if constexpr (has_identity)
{
const T id = ::cuda::identity_element<ReductionOp, T>();
val = (lane_id < num_warps) ? temp_storage.warp_aggregates[lane_id] : id;
}
else
{
val = (lane_id < num_warps) ? temp_storage.warp_aggregates[lane_id] : T{};
}

// When we have an identity element, every lane in the logical warp holds a valid value
// (real or identity), so we can take the all-lanes-valid fast path. Otherwise, fall back
// to the partial-valid form which uses num_warps as the last lane.
constexpr bool all_lanes_valid = has_identity || (FullTile && (warps == logical_lanes));
NullType dummy_storage;
warp_aggregate =
WarpReduceShfl<T, logical_lanes>(dummy_storage).template Reduce<all_lanes_valid>(val, num_warps, reduction_op);
}
return warp_aggregate;
}
}

//! @rst
Expand Down Expand Up @@ -201,14 +218,7 @@ struct BlockReduceWarpReductions
.template Reduce<(FullTile && even_warp_multiple)>(input, warp_num_valid, reduction_op);

// Update outputs and block_aggregate with warp-wide aggregates from lane-0s
if constexpr (IsDeterministic)
{
return ApplyWarpAggregates<FullTile>(reduction_op, warp_aggregate, num_valid);
}
else
{
return ApplyWarpAggregatesNonDeterministic(reduction_op, warp_aggregate);
}
return ApplyWarpAggregates<FullTile>(reduction_op, warp_aggregate, num_valid);
}

//! @rst
Expand Down Expand Up @@ -245,14 +255,7 @@ struct BlockReduceWarpReductions
.template Reduce<(FullTile && even_warp_multiple)>(input, warp_num_valid, reduction_op);

// Update outputs and block_aggregate with warp-wide aggregates from lane-0s
if constexpr (IsDeterministic)
{
return ApplyWarpAggregates<FullTile>(reduction_op, warp_aggregate, num_valid);
}
else
{
return ApplyWarpAggregatesNonDeterministic(reduction_op, warp_aggregate);
}
return ApplyWarpAggregates<FullTile>(reduction_op, warp_aggregate, num_valid);
}
};
} // namespace detail
Expand Down