-
Notifications
You must be signed in to change notification settings - Fork 394
optimize blockReduceWarp #9101
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
optimize blockReduceWarp #9101
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,58 @@ | ||
| // SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved. | ||
| // SPDX-License-Identifier: BSD-3 | ||
|
|
||
| #pragma once | ||
|
|
||
| #include <cub/config.cuh> | ||
|
|
||
| #include <cub/block/block_reduce.cuh> | ||
|
|
||
| #include <cuda_runtime_api.h> | ||
| #include <device_side_benchmark.cuh> | ||
| #include <nvbench_helper.cuh> | ||
|
|
||
| template <int BlockThreads> | ||
| struct benchmark_op_t | ||
| { | ||
| template <typename T> | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion | 🟠 Major | ⚡ Quick win — suggestion: Line 17 and Line 35 use a single-letter template parameter (`T`). As per coding guidelines: "Avoid single letter names for template parameters; use […]" (the rest of the quoted guideline was lost in extraction; the suggested change in this review renames `T` to `_Tp`). Also applies to: 35-35 |
||
| __device__ __forceinline__ T operator()(T thread_data) const | ||
| { | ||
|
Comment on lines +18 to +19
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: #!/bin/bash
# Read-only verification: confirm missing CCCL API annotations / noexcept in this file
rg -nP '^\s*(template\s*<[^>]+>\s*)?(?!.*_CCCL_(HOST_API|DEVICE_API|API)).*\b(operator\(\)|block_reduce_warp_reductions)\b' cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh
rg -nP '\b(operator\(\)|block_reduce_warp_reductions)\b(?!.*noexcept)' cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuhRepository: NVIDIA/cccl Length of output: 391 🏁 Script executed: #!/bin/bash
set -euo pipefail
sed -n '1,120p' cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh | nl -ba | sed -n '10,80p'Repository: NVIDIA/cccl Length of output: 98 🏁 Script executed: #!/bin/bash
set -euo pipefail
rg -n "operator\(\)|block_reduce_warp_reductions" cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuhRepository: NVIDIA/cccl Length of output: 289 🏁 Script executed: #!/bin/bash
set -euo pipefail
# Capture the full function signatures/headers for the two targets (best-effort)
perl -0777 -ne '
my $f=$_;
if($f =~ /(operator\(\)\s*const[^\n]*\n\s*\{)/s){ print "FOUND operator():\n$1\n"; }
if($f =~ /(block_reduce_warp_reductions[^\n]*\n\s*\{)/s){ print "FOUND block_reduce_warp_reductions():\n$1\n"; }
' cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuhRepository: NVIDIA/cccl Length of output: 170 🏁 Script executed: #!/bin/bash
set -euo pipefail
# Show the relevant top portion with line numbers (no nl dependency)
awk '{printf "%5d:%s\n", NR, $0}' cub/benchmarks/bench/reduce/block_reduce_warp_warp_reductions_base.cuh | sed -n '1,110p'Repository: NVIDIA/cccl Length of output: 196 🏁 Script executed: #!/bin/bash
set -euo pipefail
# Check for noexcept / CCCL API tokens near the declarations
rg -n "operator\(\)|block_reduce_warp_reductions" -n cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh
rg -n --context=3 "operator\(\)|block_reduce_warp_reductions" cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh
rg -n --context=0 "_CCCL_(HOST_API|DEVICE_API|API)" cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuhRepository: NVIDIA/cccl Length of output: 1049 🏁 Script executed: #!/bin/bash
set -euo pipefail
# Look for exception-throwing in the relevant functions/bodies
rg -n "\bthrow\b" cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuhRepository: NVIDIA/cccl Length of output: 37 🏁 Script executed: #!/bin/bash
set -euo pipefail
FILE="cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh"
# Print a small, line-numbered window around operator() and block_reduce_warp_reductions
sed -n '10,60p' "$FILE" | awk '{printf "%5d:%s\n", NR+10-1, $0}'Repository: NVIDIA/cccl Length of output: 2048 🏁 Script executed: #!/bin/bash
set -euo pipefail
FILE="cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh"
# Check for _CCCL_* usage and noexcept in/near the relevant symbols
rg -n "_CCCL_(HOST_API|DEVICE_API|API)" "$FILE"
rg -n "noexcept" "$FILE"
rg -n --context=2 "operator\(\)" "$FILE"
rg -n --context=2 "block_reduce_warp_reductions" "$FILE"Repository: NVIDIA/cccl Length of output: 37 🏁 Script executed: #!/bin/bash
set -euo pipefail
FILE="cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh"
# Look for exception throwing inside this file
rg -n "\bthrow\b" "$FILE"Repository: NVIDIA/cccl Length of output: 37 🏁 Script executed: #!/bin/bash
set -euo pipefail
FILE="cub/benchmarks/bench/reduce/block_reduce_warp_reductions_base.cuh"
echo "== _CCCL_* tokens =="
rg -n "_CCCL_(HOST_API|DEVICE_API|API)" "$FILE" || true
echo "== noexcept tokens =="
rg -n "noexcept" "$FILE" || true
echo "== throw tokens =="
rg -n "\bthrow\b" "$FILE" || true
Repository: NVIDIA/cccl Length of output: 117
important: Add required CCCL API annotations (`_CCCL_DEVICE_API` / `_CCCL_HOST_API`) and `noexcept`. (The remainder of this sentence was lost in extraction; it is completed here from the suggested change that follows.)
Suggested change
- __device__ __forceinline__ T operator()(T thread_data) const
+ _CCCL_DEVICE_API __forceinline__ auto operator()(_Tp thread_data) const noexcept -> _Tp
- template <typename T>
- void block_reduce_warp_reductions(nvbench::state& state, nvbench::type_list<T>)
+ template <typename _Tp>
+ _CCCL_HOST_API void block_reduce_warp_reductions(nvbench::state& state, nvbench::type_list<_Tp>) noexcept |
||
| using BlockReduce = cub::BlockReduce<T, BlockThreads, cub::BLOCK_REDUCE_WARP_REDUCTIONS>; | ||
| using TempStorage = typename BlockReduce::TempStorage; | ||
| __shared__ TempStorage temp_storage; | ||
| T agg = BlockReduce{temp_storage}.Reduce(thread_data, op_t{}); | ||
| // Re-broadcast so every thread depends on the reduction result, preventing DCE. | ||
| __shared__ T broadcast; | ||
| if (threadIdx.x == 0) | ||
| { | ||
| broadcast = agg; | ||
| } | ||
| __syncthreads(); | ||
| return broadcast; | ||
| } | ||
| }; | ||
|
|
||
// nvbench entry point for one element type `T`.
//
// Instantiates `benchmark_kernel` with a `benchmark_op_t<block_size>` action and launches it
// with a grid of (max resident blocks per SM) x (number of SMs), as reported by the CUDA
// occupancy query for this kernel and block size with no dynamic shared memory.
//
// Parameters:
//   state              - nvbench state; supplies the target device and runs the measurement.
//   nvbench::type_list - tag carrying the element type `T` chosen from the type axis.
//
// The benchmark is skipped (not failed) when the occupancy query yields a zero-sized grid.
template <typename T>
void block_reduce_warp_reductions(nvbench::state& state, nvbench::type_list<T>)
{
  constexpr int block_size    = 256; // 8 warps -> exercises optimized ApplyWarpAggregates
  constexpr int unroll_factor = 32; // compromise between compile time and noise
  using action_t              = benchmark_op_t<block_size>;
  const auto& kernel          = benchmark_kernel<block_size, unroll_factor, action_t, T>;

  // Size the grid from the occupancy query so every SM holds as many blocks as it can.
  const int num_SMs     = state.get_device().value().get_number_of_sms();
  int max_blocks_per_SM = 0;
  NVBENCH_CUDA_CALL_NOEXCEPT(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_SM, kernel, block_size, 0));
  const int grid_size = max_blocks_per_SM * num_SMs;
  if (grid_size == 0)
  {
    // A zero-block launch would be an invalid configuration; report the reason instead.
    state.skip("Kernel occupancy is zero for this type/configuration.");
    return;
  }

  // Launch on the stream nvbench passes in: that is the stream its timers are recorded on.
  // Launching on the default stream only lines up with those timers through legacy
  // default-stream implicit synchronization, which is not guaranteed by every build mode.
  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
    kernel<<<grid_size, block_size, 0, launch.get_stream()>>>(action_t{});
  });
}
|
|
||
// Registers block_reduce_warp_reductions with nvbench, instantiated over the types listed in
// `value_types`. The benchmark is named "base" and its single type axis is labelled "T{ct}".
// `value_types` is not declared in this header; the including file is expected to define it
// before the #include.
| NVBENCH_BENCH_TYPES(block_reduce_warp_reductions, NVBENCH_TYPE_AXES(value_types)) | ||
| .set_name("base") | ||
| .set_type_axes_names({"T{ct}"}); | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,35 @@ | ||
| // SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved. | ||
| // SPDX-License-Identifier: BSD-3 | ||
|
|
||
| #include <nvbench_helper.cuh> | ||
|
|
||
// Element types swept by the benchmark's type axis, in the order listed here.
// Entries that need optional compiler/toolkit support (128-bit integers, __half,
// __nv_bfloat16, __float128 and the complex types built on them) are compiled in only
// when the matching _CCCL_HAS_* / _CCCL_CTK_AT_LEAST feature test is true.
// This alias has to be declared before the base header is included, because that header
// names `value_types` in its NVBENCH_BENCH_TYPES registration.
| using value_types = nvbench::type_list< | ||
| int8_t, | ||
| int16_t, | ||
| int32_t, | ||
| int64_t, | ||
| #if _CCCL_HAS_INT128() | ||
| int128_t, | ||
| #endif | ||
| #if _CCCL_HAS_NVFP16() && _CCCL_CTK_AT_LEAST(12, 2) | ||
| __half, | ||
| #endif | ||
| #if _CCCL_HAS_NVBF16() && _CCCL_CTK_AT_LEAST(12, 2) | ||
| __nv_bfloat16, | ||
| #endif | ||
| float, | ||
| double, | ||
| #if _CCCL_HAS_FLOAT128() | ||
| __float128, | ||
| #endif | ||
| #if _CCCL_HAS_NVFP16() && _CCCL_CTK_AT_LEAST(12, 2) | ||
| cuda::std::complex<__half>, | ||
| #endif | ||
| #if _CCCL_HAS_NVBF16() && _CCCL_CTK_AT_LEAST(12, 2) | ||
| cuda::std::complex<__nv_bfloat16>, | ||
| #endif | ||
| cuda::std::complex<float>, | ||
| cuda::std::complex<double>>; | ||
|
|
||
// Reduction operator for this benchmark variant (transparent addition). The base header
// included on the following line default-constructs it (`op_t{}`) as the operator handed to
// BlockReduce::Reduce, so the alias must be defined before that #include.
| using op_t = ::cuda::std::plus<>; | ||
| #include "block_reduce_warp_reductions_base.cuh" |
Uh oh!
There was an error while loading. Please reload this page.