diff --git a/ci/bench.yaml b/ci/bench.yaml index f1fe0c963fc..cdbb4ba7719 100644 --- a/ci/bench.yaml +++ b/ci/bench.yaml @@ -32,6 +32,8 @@ benchmarks: # Examples: # - '^cub\.bench\.for_each\.base' # - '^cub\.bench\.reduce\.(sum|min)\.' + - '^cub\.bench\.reduce\.block_reduce_warp_reductions_sum\.' + - '^cub\.bench\.reduce\.warp_reduce_(sum|min)\.' # Python benchmark filters (regex matched against paths under benchmarks/). python: @@ -44,11 +46,11 @@ benchmarks: gpus: # - "t4" # sm_75, 16 GB # - "rtx2080" # sm_75, 8 GB - # - "rtxa6000" # sm_86, 48 GB + - "rtxa6000" # sm_86, 48 GB # - "l4" # sm_89, 24 GB # - "rtx4090" # sm_89, 24 GB - # - "h100" # sm_90, 80 GB - # - "rtxpro6000" # sm_120 + - "h100" # sm_90, 80 GB + - "rtxpro6000" # sm_120 # Extra .devcontainer/launch.sh -d args # launch_args: "--cuda 13.2 --host gcc14" diff --git a/cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh b/cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh new file mode 100644 index 00000000000..21bd4a9eb51 --- /dev/null +++ b/cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3 + +#pragma once + +#include + +#include + +#include +#include +#include + +template +struct benchmark_op_t +{ + template + __device__ __forceinline__ T operator()(T thread_data) const + { + using BlockReduce = cub::BlockReduce; + using TempStorage = typename BlockReduce::TempStorage; + __shared__ TempStorage temp_storage; + T agg = BlockReduce{temp_storage}.Reduce(thread_data, op_t{}); + // Re-broadcast so every thread depends on the reduction result, preventing DCE. + __shared__ T broadcast; + if (threadIdx.x == 0) + { + broadcast = agg; + } + __syncthreads(); + return broadcast; + } +}; + +template +void block_reduce_warp_reductions(nvbench::state& state, nvbench::type_list) +{ + constexpr int block_size = 256; // 8 warps -> exercises optimized ApplyWarpAggregates + constexpr int unroll_factor = 32; // compromise between compile time and noise + using action_t = benchmark_op_t; + const auto& kernel = benchmark_kernel; + const int num_SMs = state.get_device().value().get_number_of_sms(); + int max_blocks_per_SM = 0; + NVBENCH_CUDA_CALL_NOEXCEPT(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_SM, kernel, block_size, 0)); + const int grid_size = max_blocks_per_SM * num_SMs; + if (grid_size == 0) + { + state.skip("Kernel occupancy is zero for this type/configuration."); + return; + } + state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch&) { + kernel<<>>(action_t{}); + }); +} + +NVBENCH_BENCH_TYPES(block_reduce_warp_reductions, NVBENCH_TYPE_AXES(value_types)) + .set_name("base") + .set_type_axes_names({"T{ct}"}); diff --git a/cub/benchmarks/bench/reduce/block_reduce_warp_reductions_sum.cu b/cub/benchmarks/bench/reduce/block_reduce_warp_reductions_sum.cu new file mode 100644 index 00000000000..63f31ef8f93 --- /dev/null +++ b/cub/benchmarks/bench/reduce/block_reduce_warp_reductions_sum.cu @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3 + +#include + +using value_types = nvbench::type_list< + int8_t, + int16_t, + int32_t, + int64_t, +#if _CCCL_HAS_INT128() + int128_t, +#endif +#if _CCCL_HAS_NVFP16() && _CCCL_CTK_AT_LEAST(12, 2) + __half, +#endif +#if _CCCL_HAS_NVBF16() && _CCCL_CTK_AT_LEAST(12, 2) + __nv_bfloat16, +#endif + float, + double, +#if _CCCL_HAS_FLOAT128() + __float128, +#endif +#if _CCCL_HAS_NVFP16() && _CCCL_CTK_AT_LEAST(12, 2) + cuda::std::complex<__half>, +#endif +#if _CCCL_HAS_NVBF16() && _CCCL_CTK_AT_LEAST(12, 2) + cuda::std::complex<__nv_bfloat16>, +#endif + cuda::std::complex, + cuda::std::complex>; + +using op_t = ::cuda::std::plus<>; +#include "block_reduce_warp_reductions_base.cuh" diff --git a/cub/benchmarks/bench/reduce/warp_reduce_base.cuh b/cub/benchmarks/bench/reduce/warp_reduce_base.cuh index c407bc2be35..0cfb5c7098e 100644 --- a/cub/benchmarks/bench/reduce/warp_reduce_base.cuh +++ b/cub/benchmarks/bench/reduce/warp_reduce_base.cuh @@ -33,6 +33,11 @@ void warp_reduce(nvbench::state& state, nvbench::type_list) int max_blocks_per_SM = 0; NVBENCH_CUDA_CALL_NOEXCEPT(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_SM, kernel, block_size, 0)); const int grid_size = max_blocks_per_SM * num_SMs; + if (grid_size == 0) + { + state.skip("Kernel occupancy is zero for this type/configuration."); + return; + } state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch&) { kernel<<>>(benchmark_op_t{}); }); diff --git a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh index 7fcd207939d..a7cb4231b63 100644 --- a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh +++ b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -21,13 +21,18 @@ #endif // no system header #include +#include #include #include #include +#include +#include #include -#include #include +#include + +#include CUB_NAMESPACE_BEGIN namespace detail @@ -97,46 +102,10 @@ struct BlockReduceWarpReductions {} //! @rst - //! Returns block-wide aggregate in *thread*\ :sub:`0`. - //! @endrst - //! - //! @tparam ReductionOp - //! **[inferred]** Binary reduction operator type - //! - //! @param[in] reduction_op - //! Binary reduction operator + //! Cooperatively reduces warp aggregates. //! - //! @param[in] warp_aggregate - //! **[**\ *lane*\ :sub:`0` **only]** Warp-wide aggregate reduction of input items - template - _CCCL_DEVICE _CCCL_FORCEINLINE T ApplyWarpAggregatesNonDeterministic(ReductionOp reduction_op, T warp_aggregate) - { - if (linear_tid == 0) - { - detail::uninitialized_copy_single(temp_storage.warp_aggregates, warp_aggregate); - } - - __syncthreads(); - - // Warp 0 already contributed its aggregate above since its also linear_tid == 0 - if (lane_id == 0 && warp_id != 0) - { - // TODO: replace this with other atomic operations when specified - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_60, - ({ - ::cuda::atomic_ref atomic_target(temp_storage.warp_aggregates[0]); - atomic_target.fetch_add(warp_aggregate, ::cuda::memory_order_relaxed); - }), - (atomicAdd(&temp_storage.warp_aggregates[0], warp_aggregate);)); - } - - __syncthreads(); - return temp_storage.warp_aggregates[0]; - } - - //! @rst - //! Recursively applies warp aggregates using template unrolling for deterministic reduction. + //! For small blocks (few warps) the reduction is performed sequentially by ``linear_tid == 0``. + //! For larger blocks, warp 0 reduces the warp aggregates in parallel: //! @endrst //! //! @tparam FullTile @@ -155,21 +124,69 @@ struct BlockReduceWarpReductions __syncthreads(); - // Update total aggregate in warp 0, lane 0 - if (linear_tid == 0) + // Below this number of warps the parallel warp-0 reduction is not worthwhile compared to a + // single-thread sequential loop over the warp aggregates. + constexpr int small_block_warp_threshold = 8; + + if constexpr (warps < small_block_warp_threshold) { - _CCCL_PRAGMA_UNROLL_FULL() - for (int warp_idx = 1; warp_idx < warps; ++warp_idx) + // Sequential reduction in linear_tid == 0 (legacy path for small blocks). + if (linear_tid == 0) { - if (FullTile || (warp_idx * logical_warp_size < num_valid)) + _CCCL_PRAGMA_UNROLL_FULL() + for (int warp_idx = 1; warp_idx < warps; ++warp_idx) { - T addend = temp_storage.warp_aggregates[warp_idx]; - warp_aggregate = reduction_op(warp_aggregate, addend); + if (FullTile || (warp_idx * logical_warp_size < num_valid)) + { + warp_aggregate = reduction_op(warp_aggregate, temp_storage.warp_aggregates[warp_idx]); + } } } + return warp_aggregate; } + else + { + // Parallel reduction in warp 0. + if (warp_id == 0) + { + const int num_warps = FullTile ? int(warps) : static_cast(::cuda::ceil_div(num_valid, logical_warp_size)); + + if constexpr (is_redux_enabled_cuda_operator) + { + static_assert(::cuda::has_identity_element_v, + "REDUX-eligible operators must have an identity element"); + NV_IF_TARGET(NV_PROVIDES_SM_80, ({ + const T id = ::cuda::identity_element(); + const T val = (lane_id < num_warps) ? temp_storage.warp_aggregates[lane_id] : id; + return reduce_op_sync(val, 0xFFFFFFFFu, reduction_op); + })) + } + + // Shuffle-based tree over bit_ceil(warps) lanes. + constexpr int logical_lanes = static_cast(::cuda::std::bit_ceil(static_cast(warps))); - return warp_aggregate; + T val; + constexpr bool has_identity = ::cuda::has_identity_element_v; + if constexpr (has_identity) + { + const T id = ::cuda::identity_element(); + val = (lane_id < num_warps) ? temp_storage.warp_aggregates[lane_id] : id; + } + else + { + val = (lane_id < num_warps) ? temp_storage.warp_aggregates[lane_id] : T{}; + } + + // When we have an identity element, every lane in the logical warp holds a valid value + // (real or identity), so we can take the all-lanes-valid fast path. Otherwise, fall back + // to the partial-valid form which uses num_warps as the last lane. + constexpr bool all_lanes_valid = has_identity || (FullTile && (warps == logical_lanes)); + NullType dummy_storage; + warp_aggregate = + WarpReduceShfl(dummy_storage).template Reduce(val, num_warps, reduction_op); + } + return warp_aggregate; + } } //! @rst @@ -201,14 +218,7 @@ struct BlockReduceWarpReductions .template Reduce<(FullTile && even_warp_multiple)>(input, warp_num_valid, reduction_op); // Update outputs and block_aggregate with warp-wide aggregates from lane-0s - if constexpr (IsDeterministic) - { - return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); - } - else - { - return ApplyWarpAggregatesNonDeterministic(reduction_op, warp_aggregate); - } + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); } //! @rst @@ -245,14 +255,7 @@ struct BlockReduceWarpReductions .template Reduce<(FullTile && even_warp_multiple)>(input, warp_num_valid, reduction_op); // Update outputs and block_aggregate with warp-wide aggregates from lane-0s - if constexpr (IsDeterministic) - { - return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); - } - else - { - return ApplyWarpAggregatesNonDeterministic(reduction_op, warp_aggregate); - } + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); } }; } // namespace detail