Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion c/parallel/include/cccl/c/segmented_reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,12 @@ CCCL_C_API CUresult cccl_device_segmented_reduce(
size_t* temp_storage_bytes,
cccl_iterator_t d_in,
cccl_iterator_t d_out,
uint64_t num_offsets,
uint64_t num_segments,
cccl_iterator_t start_offset_in,
cccl_iterator_t end_offset_in,
cccl_op_t op,
cccl_value_t init,
size_t max_segment_size,
Comment thread
NaderAlAwar marked this conversation as resolved.
CUstream stream);

CCCL_C_API CUresult cccl_device_segmented_reduce_cleanup(cccl_device_segmented_reduce_build_result_t* bld_ptr);
Expand Down
5 changes: 3 additions & 2 deletions c/parallel/src/segmented_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,7 @@ CUresult cccl_device_segmented_reduce(
cccl_iterator_t end_offset,
cccl_op_t op,
cccl_value_t init,
size_t max_segment_size,
CUstream stream)
{
bool pushed = false;
Expand All @@ -316,7 +317,7 @@ CUresult cccl_device_segmented_reduce(
indirect_iterator_t{end_offset},
indirect_arg_t{op},
indirect_arg_t{init},
/* max_segment_size */ size_t{0},
max_segment_size,
stream,
*static_cast<cub::detail::segmented_reduce::policy_selector*>(build.runtime_policy),
segmented_reduce::segmented_reduce_kernel_source{build},
Expand All @@ -327,7 +328,7 @@ CUresult cccl_device_segmented_reduce(
catch (const std::exception& exc)
{
fflush(stderr);
printf("\nEXCEPTION in cccl_device_reduce(): %s\n", exc.what());
printf("\nEXCEPTION in cccl_device_segmented_reduce(): %s\n", exc.what());
fflush(stdout);
error = CUDA_ERROR_UNKNOWN;
}
Expand Down
205 changes: 203 additions & 2 deletions c/parallel/test/test_segmented_reduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,65 @@ struct segmented_reduce_build
struct segmented_reduce_run
{
template <typename... Ts>
CUresult operator()(Ts... args) const noexcept
CUresult operator()(
cccl_device_segmented_reduce_build_result_t build,
void* d_temp_storage,
size_t* temp_storage_bytes,
cccl_iterator_t d_in,
cccl_iterator_t d_out,
uint64_t num_segments,
cccl_iterator_t start_offset,
cccl_iterator_t end_offset,
cccl_op_t op,
cccl_value_t init,
CUstream stream) const noexcept
{
return cccl_device_segmented_reduce(
build,
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
start_offset,
end_offset,
op,
init,
0,
stream);
}
};

// Variant that passes a compile-time guaranteed_max_segment_size to exercise different dispatch policies
template <size_t GuaranteedMaxSegmentSize>
struct segmented_reduce_run_guaranteed
{
CUresult operator()(
cccl_device_segmented_reduce_build_result_t build,
void* d_temp_storage,
size_t* temp_storage_bytes,
cccl_iterator_t d_in,
cccl_iterator_t d_out,
uint64_t num_segments,
cccl_iterator_t start_offset,
cccl_iterator_t end_offset,
cccl_op_t op,
cccl_value_t init,
CUstream stream) const noexcept
{
return cccl_device_segmented_reduce(args...);
return cccl_device_segmented_reduce(
build,
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
start_offset,
end_offset,
op,
init,
GuaranteedMaxSegmentSize,
stream);
}
};

Expand All @@ -99,6 +155,28 @@ void segmented_reduce(
cache, lookup_key, input, output, num_segments, start_offsets, end_offsets, op, init);
}

template <size_t GuaranteedMaxSegmentSize,
typename BuildCache = segmented_reduce_build_cache_t,
typename KeyT = std::string>
void segmented_reduce_guaranteed(
cccl_iterator_t input,
cccl_iterator_t output,
uint64_t num_segments,
cccl_iterator_t start_offsets,
cccl_iterator_t end_offsets,
cccl_op_t op,
cccl_value_t init,
std::optional<BuildCache>& cache,
const std::optional<KeyT>& lookup_key)
{
AlgorithmExecute<BuildResultT,
segmented_reduce_build,
segmented_reduce_cleanup,
segmented_reduce_run_guaranteed<GuaranteedMaxSegmentSize>,
BuildCache,
KeyT>(cache, lookup_key, input, output, num_segments, start_offsets, end_offsets, op, init);
}

// ==============
// Test section
// ==============
Expand Down Expand Up @@ -976,3 +1054,126 @@ extern "C" __device__ void {0}(const void *x1_p, const void *x2_p, void *out_p)

REQUIRE(expected_value == std::vector<CmpT>(as_expected)[0]);
}

// ==============
// guaranteed_max_segment_size tests
// These exercise the small / medium / large dispatch policies in CUB segmented reduce.
// Segment sizes are fixed so the guarantee exactly matches, verifying correctness across policies.
// ==============

// Helper shared by all three tests: builds offset iterators for uniform-size segments
// and calls segmented_reduce_guaranteed, then verifies against std::reduce.
template <size_t GuaranteedMaxSegmentSize, typename TestType, typename BuildCache>
void run_guaranteed_max_seg_size_test(
std::size_t n_rows,
std::size_t n_cols,
std::optional<BuildCache>& build_cache,
const std::optional<std::string>& test_key)
{
const std::size_t n_elems = n_rows * n_cols;
const std::size_t segment_size = n_cols;

const std::vector<TestType> host_input = generate<TestType>(n_elems);
std::vector<TestType> host_output(n_rows, 0);

pointer_t<TestType> input_ptr(host_input);
pointer_t<TestType> output_ptr(host_output);

using SizeT = unsigned long long;
static constexpr std::string_view index_ty_name = "unsigned long long";

struct row_offset_iterator_state_t
{
SizeT linear_id;
SizeT segment_size;
};

static constexpr std::string_view offset_iterator_state_name = "row_offset_iterator_state_t";
static constexpr std::string_view advance_offset_method_name = "advance_offset_it";
static constexpr std::string_view deref_offset_method_name = "dereference_offset_it";

const auto& [offset_iterator_state_src, offset_iterator_advance_src, offset_iterator_deref_src] =
make_step_counting_iterator_sources(
index_ty_name, offset_iterator_state_name, advance_offset_method_name, deref_offset_method_name);

iterator_t<SizeT, row_offset_iterator_state_t> start_offset_it = make_iterator<SizeT, row_offset_iterator_state_t>(
{offset_iterator_state_name, offset_iterator_state_src},
{advance_offset_method_name, offset_iterator_advance_src},
{deref_offset_method_name, offset_iterator_deref_src});

start_offset_it.state.linear_id = 0;
start_offset_it.state.segment_size = segment_size;

iterator_t<SizeT, row_offset_iterator_state_t> end_offset_it = make_iterator<SizeT, row_offset_iterator_state_t>(
{offset_iterator_state_name, ""}, {advance_offset_method_name, ""}, {deref_offset_method_name, ""});

end_offset_it.state.linear_id = 1;
end_offset_it.state.segment_size = segment_size;

operation_t op = make_operation("op", get_reduce_op(get_type_info<TestType>().type));
value_t<TestType> init{0};

segmented_reduce_guaranteed<GuaranteedMaxSegmentSize>(
input_ptr, output_ptr, n_rows, start_offset_it, end_offset_it, op, init, build_cache, test_key);

for (std::size_t i = 0; i < n_rows; ++i)
{
std::size_t row_offset = i * segment_size;
host_output[i] = std::reduce(host_input.begin() + row_offset, host_input.begin() + row_offset + n_cols);
}
REQUIRE(host_output == std::vector<TestType>(output_ptr));
}

// Small segments (≤16 elements): exercises the small warp-level dispatch policy
struct SegmentedReduce_GuaranteedMaxSegSize_Small_Fixture_Tag;
C2H_TEST_LIST("segmented_reduce respects guaranteed_max_segment_size for small segments",
"[segmented_reduce][guaranteed_max_segment_size]",
std::int32_t,
std::int64_t,
std::uint32_t,
std::uint64_t)
{
static constexpr std::size_t segment_size = 8;
const std::size_t n_rows = GENERATE(0, 13, take(2, random(100, 200)));

auto& build_cache = get_cache<SegmentedReduce_GuaranteedMaxSegSize_Small_Fixture_Tag>();
const auto& test_key = make_key<TestType>();

run_guaranteed_max_seg_size_test<segment_size, TestType>(n_rows, segment_size, build_cache, test_key);
}

// Medium segments (≤256 elements): exercises the medium warp-level dispatch policy
struct SegmentedReduce_GuaranteedMaxSegSize_Medium_Fixture_Tag;
C2H_TEST_LIST("segmented_reduce respects guaranteed_max_segment_size for medium segments",
"[segmented_reduce][guaranteed_max_segment_size]",
std::int32_t,
std::int64_t,
std::uint32_t,
std::uint64_t)
{
static constexpr std::size_t segment_size = 64;
const std::size_t n_rows = GENERATE(0, 13, take(2, random(50, 100)));

auto& build_cache = get_cache<SegmentedReduce_GuaranteedMaxSegSize_Medium_Fixture_Tag>();
const auto& test_key = make_key<TestType>();

run_guaranteed_max_seg_size_test<segment_size, TestType>(n_rows, segment_size, build_cache, test_key);
}

// Large segments (≥512 elements): exercises the large block-level dispatch policy
struct SegmentedReduce_GuaranteedMaxSegSize_Large_Fixture_Tag;
C2H_TEST_LIST("segmented_reduce respects guaranteed_max_segment_size for large segments",
"[segmented_reduce][guaranteed_max_segment_size]",
std::int32_t,
std::int64_t,
std::uint32_t,
std::uint64_t)
{
static constexpr std::size_t segment_size = 1024;
const std::size_t n_rows = GENERATE(0, 5, take(2, random(10, 20)));

auto& build_cache = get_cache<SegmentedReduce_GuaranteedMaxSegSize_Large_Fixture_Tag>();
const auto& test_key = make_key<TestType>();

run_guaranteed_max_seg_size_test<segment_size, TestType>(n_rows, segment_size, build_cache, test_key);
}
7 changes: 7 additions & 0 deletions python/cuda_cccl/benchmarks/compute/quick_configs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,13 @@ segmented_sort/keys:
"Elements{io}": 22
"MaxSegmentSize": 10

segmented_reduce/variable_sum:
benchmarks:
variable_default:
"T{ct}": "I32"
"Elements{io}": 16
"MaxSegmentSize": 4

partition/three_way:
"T{ct}": "I32"
"Elements{io}": 16
Expand Down
18 changes: 16 additions & 2 deletions python/cuda_cccl/benchmarks/compute/run_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,27 @@
"merge_sort/keys",
"merge_sort/pairs",
"segmented_sort/keys",
"segmented_reduce/variable_sum",
"partition/three_way",
]

# Axes that use power-of-two values (need [pow2] suffix for nvbench)
# These are the base names (without {ct}/{io} suffixes)
POW2_AXES_CPP = {"Elements", "MaxSegSize", "MaxSegmentSize", "SegmentSize", "Segments"}
POW2_AXES_PY = {"Elements", "MaxSegSize", "MaxSegmentSize", "Segments"}
POW2_AXES_CPP = {
"Elements",
"GuaranteedMaxSegSize",
"MaxSegSize",
"MaxSegmentSize",
"SegmentSize",
"Segments",
}
POW2_AXES_PY = {
"Elements",
"GuaranteedMaxSegSize",
"MaxSegSize",
"MaxSegmentSize",
"Segments",
}

# Axis name mappings from C++ to Python.
# Keep type axes in their C++ form (`{ct}`) to match benchmark axis names.
Expand Down
Loading
Loading