diff --git a/fern/ai_examples_override.yml b/fern/ai_examples_override.yml
new file mode 100644
index 0000000..bce79c3
--- /dev/null
+++ b/fern/ai_examples_override.yml
@@ -0,0 +1,13 @@
+paths:
+  /user/{username}:
+    get:
+      x-fern-examples:
+        - path-parameters:
+            username: username
+          request:
+            body: {}
+          response:
+            body:
+              id: 42
+              username: plantlover99
+              email: plantlover99@example.com
diff --git a/fern/cudapages/cub/cub/cub/AgentAdjacentDifferencePolicy.mdx b/fern/cudapages/cub/cub/cub/AgentAdjacentDifferencePolicy.mdx
new file mode 100644
index 0000000..a8d066d
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/AgentAdjacentDifferencePolicy.mdx
@@ -0,0 +1,38 @@
+---
+title: cub::AgentAdjacentDifferencePolicy
+description: ""
+---
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | |
+| `ITEMS_PER_THREAD` static constexpr | `int` | |
+| `ITEMS_PER_TILE` static constexpr | `int` | |
+| `LOAD_ALGORITHM` static constexpr | `cub::BlockLoadAlgorithm` | |
+| `LOAD_MODIFIER` static constexpr | `cub::CacheLoadModifier` | |
+| `STORE_ALGORITHM` static constexpr | `cub::BlockStoreAlgorithm` | |
diff --git a/fern/cudapages/cub/cub/cub/AgentHistogramPolicy.mdx b/fern/cudapages/cub/cub/cub/AgentHistogramPolicy.mdx
new file mode 100644
index 0000000..38781a0
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/AgentHistogramPolicy.mdx
@@ -0,0 +1,59 @@
+---
+title: cub::AgentHistogramPolicy
+description: "Parameterizable tuning policy type for AgentHistogram."
+---
+
+Parameterizable tuning policy type for AgentHistogram.
+
+
+
+
+
+Threads per thread block
+
+
+
+Pixels per thread (per tile of input)
+
+
+
+The [BlockLoad](/library/api/cub::_block_load) algorithm to use
+
+
+
+Cache load modifier for reading input elements
+
+
+
+Whether to perform localized RLE to compress samples before histogramming
+
+
+
+Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
+
+
+
+Whether to dequeue tiles from a global work queue
+
+
+
+Vector size for samples loading (1, 2, 4)
+
+
+
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | Threads per thread block. |
+| `PIXELS_PER_THREAD` static constexpr | `int` | Pixels per thread (per tile of input). |
+| `IS_RLE_COMPRESS` static constexpr | `bool` | Whether to perform localized RLE to compress samples before histogramming. |
+| `MEM_PREFERENCE` static constexpr | `BlockHistogramMemoryPreference` | Whether to prefer privatized shared-memory bins (versus privatized global-memory bins). |
+| `IS_WORK_STEALING` static constexpr | `bool` | Whether to dequeue tiles from a global work queue. |
+| `VEC_SIZE` static constexpr | `int` | Vector size for samples loading (1, 2, 4). |
+| `LOAD_ALGORITHM` static constexpr | `BlockLoadAlgorithm` | The [BlockLoad](/library/api/cub::_block_load) algorithm to use. |
+| `LOAD_MODIFIER` static constexpr | `CacheLoadModifier` | Cache load modifier for reading input elements. |
diff --git a/fern/cudapages/cub/cub/cub/AgentMergeSortPolicy.mdx b/fern/cudapages/cub/cub/cub/AgentMergeSortPolicy.mdx
new file mode 100644
index 0000000..ae2d584
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/AgentMergeSortPolicy.mdx
@@ -0,0 +1,38 @@
+---
+title: cub::AgentMergeSortPolicy
+description: ""
+---
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | |
+| `ITEMS_PER_THREAD` static constexpr | `int` | |
+| `ITEMS_PER_TILE` static constexpr | `int` | |
+| `LOAD_ALGORITHM` static constexpr | `cub::BlockLoadAlgorithm` | |
+| `LOAD_MODIFIER` static constexpr | `cub::CacheLoadModifier` | |
+| `STORE_ALGORITHM` static constexpr | `cub::BlockStoreAlgorithm` | |
diff --git a/fern/cudapages/cub/cub/cub/AgentRadixSortDownsweepPolicy.mdx b/fern/cudapages/cub/cub/cub/AgentRadixSortDownsweepPolicy.mdx
new file mode 100644
index 0000000..c7330ff
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/AgentRadixSortDownsweepPolicy.mdx
@@ -0,0 +1,61 @@
+---
+title: cub::AgentRadixSortDownsweepPolicy
+description: "Parameterizable tuning policy type for AgentRadixSortDownsweep."
+---
+
+Parameterizable tuning policy type for AgentRadixSortDownsweep.
+
+
+
+
+
+Threads per thread block
+
+
+
+Items per thread (per tile of input)
+
+
+
+Dominant compute type
+
+
+
+The [BlockLoad](/library/api/cub::_block_load) algorithm to use
+
+
+
+Cache load modifier for reading keys (and values)
+
+
+
+The radix ranking algorithm to use
+
+
+
+The block scan algorithm to use
+
+
+
+The number of radix bits, i.e., log2(bins)
+
+
+
+
+
+
+
+
+**Inherits from:** `detail::RegBoundScaling< NominalBlockThreads4B, NominalItemsPerThread4B, ComputeT >` (public)
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `RADIX_BITS` static constexpr | `int` | The number of radix bits, i.e., log2(bins). |
+| `LOAD_ALGORITHM` static constexpr | `BlockLoadAlgorithm` | The [BlockLoad](/library/api/cub::_block_load) algorithm to use. |
+| `LOAD_MODIFIER` static constexpr | `CacheLoadModifier` | Cache load modifier for reading keys (and values). |
+| `RANK_ALGORITHM` static constexpr | `RadixRankAlgorithm` | The radix ranking algorithm to use. |
+| `SCAN_ALGORITHM` static constexpr | `BlockScanAlgorithm` | The [BlockScan](/library/api/cub::_block_scan) algorithm to use. |
diff --git a/fern/cudapages/cub/cub/cub/AgentRadixSortExclusiveSumPolicy.mdx b/fern/cudapages/cub/cub/cub/AgentRadixSortExclusiveSumPolicy.mdx
new file mode 100644
index 0000000..bc1d0a5
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/AgentRadixSortExclusiveSumPolicy.mdx
@@ -0,0 +1,25 @@
+---
+title: cub::AgentRadixSortExclusiveSumPolicy
+description: ""
+---
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | |
+| `RADIX_BITS` static constexpr | `int` | |
diff --git a/fern/cudapages/cub/cub/cub/AgentRadixSortHistogramPolicy.mdx b/fern/cudapages/cub/cub/cub/AgentRadixSortHistogramPolicy.mdx
new file mode 100644
index 0000000..4a74491
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/AgentRadixSortHistogramPolicy.mdx
@@ -0,0 +1,50 @@
+---
+title: cub::AgentRadixSortHistogramPolicy
+description: ""
+---
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+If void, use NOMINAL_4B_NUM_PARTS directly for NUM_PARTS. Otherwise, perform scaling.
+
+
+
+
+
+
+
+
+---
+
+## Static methods
+
+### num_parts_helper inline static constexpr
+
+
+```cpp showLineNumbers={false}
+template
+static constexpr int cub::AgentRadixSortHistogramPolicy::num_parts_helper()
+```
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | |
+| `ITEMS_PER_THREAD` static constexpr | `int` | |
+| `NUM_PARTS` static constexpr | `int` | NUM_PARTS is the number of private histograms (parts) each histogram is split into. |
+| `RADIX_BITS` static constexpr | `int` | |
diff --git a/fern/cudapages/cub/cub/cub/AgentRadixSortOnesweepPolicy.mdx b/fern/cudapages/cub/cub/cub/AgentRadixSortOnesweepPolicy.mdx
new file mode 100644
index 0000000..759c8c5
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/AgentRadixSortOnesweepPolicy.mdx
@@ -0,0 +1,51 @@
+---
+title: cub::AgentRadixSortOnesweepPolicy
+description: ""
+---
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**Inherits from:** `detail::RegBoundScaling< NominalBlockThreads4B, NominalItemsPerThread4B, ComputeT >` (public)
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `RANK_NUM_PARTS` static constexpr | `int` | |
+| `RADIX_BITS` static constexpr | `int` | |
+| `RANK_ALGORITHM` static constexpr | `RadixRankAlgorithm` | |
+| `SCAN_ALGORITHM` static constexpr | `BlockScanAlgorithm` | |
+| `STORE_ALGORITHM` static constexpr | `RadixSortStoreAlgorithm` | |
diff --git a/fern/cudapages/cub/cub/cub/AgentRadixSortUpsweepPolicy.mdx b/fern/cudapages/cub/cub/cub/AgentRadixSortUpsweepPolicy.mdx
new file mode 100644
index 0000000..794a204
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/AgentRadixSortUpsweepPolicy.mdx
@@ -0,0 +1,46 @@
+---
+title: cub::AgentRadixSortUpsweepPolicy
+description: "Parameterizable tuning policy type for AgentRadixSortUpsweep."
+---
+
+Parameterizable tuning policy type for AgentRadixSortUpsweep.
+
+
+
+
+
+Threads per thread block
+
+
+
+Items per thread (per tile of input)
+
+
+
+Dominant compute type
+
+
+
+Cache load modifier for reading keys
+
+
+
+The number of radix bits, i.e., log2(bins)
+
+
+
+
+
+
+
+
+**Inherits from:** `detail::RegBoundScaling< NominalBlockThreads4B, NominalItemsPerThread4B, ComputeT >` (public)
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `RADIX_BITS` static constexpr | `int` | The number of radix bits, i.e., log2(bins). |
+| `LOAD_MODIFIER` static constexpr | `CacheLoadModifier` | Cache load modifier for reading keys. |
diff --git a/fern/cudapages/cub/cub/cub/AgentReduceByKeyPolicy.mdx b/fern/cudapages/cub/cub/cub/AgentReduceByKeyPolicy.mdx
new file mode 100644
index 0000000..502fd02
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/AgentReduceByKeyPolicy.mdx
@@ -0,0 +1,48 @@
+---
+title: cub::AgentReduceByKeyPolicy
+description: "Parameterizable tuning policy type for AgentReduceByKey."
+---
+
+Parameterizable tuning policy type for AgentReduceByKey.
+
+
+
+
+
+Threads per thread block
+
+
+
+Items per thread (per tile of input)
+
+
+
+The [BlockLoad](/library/api/cub::_block_load) algorithm to use
+
+
+
+Cache load modifier for reading input elements
+
+
+
+The [BlockScan](/library/api/cub::_block_scan) algorithm to use
+
+
+
+Implementation detail, do not specify directly, requirements on the content of this type are subject to breaking change.
+
+
+
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | Threads per thread block. |
+| `ITEMS_PER_THREAD` static constexpr | `int` | Items per thread (per tile of input). |
+| `LOAD_ALGORITHM` static constexpr | `BlockLoadAlgorithm` | The [BlockLoad](/library/api/cub::_block_load) algorithm to use. |
+| `LOAD_MODIFIER` static constexpr | `CacheLoadModifier` | Cache load modifier for reading input elements. |
+| `SCAN_ALGORITHM` static constexpr | `BlockScanAlgorithm` | The [BlockScan](/library/api/cub::_block_scan) algorithm to use. |
diff --git a/fern/cudapages/cub/cub/cub/AgentReducePolicy.mdx b/fern/cudapages/cub/cub/cub/AgentReducePolicy.mdx
new file mode 100644
index 0000000..2a57060
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/AgentReducePolicy.mdx
@@ -0,0 +1,51 @@
+---
+title: cub::AgentReducePolicy
+description: "Parameterizable tuning policy type for AgentReduce."
+---
+
+Parameterizable tuning policy type for AgentReduce.
+
+
+
+
+
+Threads per thread block
+
+
+
+Items per thread (per tile of input)
+
+
+
+Dominant compute type
+
+
+
+Number of items per vectorized load
+
+
+
+Cooperative block-wide reduction algorithm to use
+
+
+
+Cache load modifier for reading input elements
+
+
+
+
+
+
+
+
+**Inherits from:** `detail::MemBoundScaling< NominalBlockThreads4B, NominalItemsPerThread4B, ComputeT >` (public)
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `VECTOR_LOAD_LENGTH` static constexpr | `int` | Number of items per vectorized load. |
+| `BLOCK_ALGORITHM` static constexpr | `BlockReduceAlgorithm` | Cooperative block-wide reduction algorithm to use. |
+| `LOAD_MODIFIER` static constexpr | `CacheLoadModifier` | Cache load modifier for reading input elements. |
diff --git a/fern/cudapages/cub/cub/cub/AgentRlePolicy.mdx b/fern/cudapages/cub/cub/cub/AgentRlePolicy.mdx
new file mode 100644
index 0000000..05ee349
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/AgentRlePolicy.mdx
@@ -0,0 +1,53 @@
+---
+title: cub::AgentRlePolicy
+description: "Parameterizable tuning policy type for AgentRle."
+---
+
+Parameterizable tuning policy type for AgentRle.
+
+
+
+
+
+Threads per thread block
+
+
+
+Items per thread (per tile of input)
+
+
+
+The [BlockLoad](/library/api/cub::_block_load) algorithm to use
+
+
+
+Cache load modifier for reading input elements
+
+
+
+Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+
+
+
+The [BlockScan](/library/api/cub::_block_scan) algorithm to use
+
+
+
+Implementation detail, do not specify directly, requirements on the content of this type are subject to breaking change.
+
+
+
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | Threads per thread block. |
+| `ITEMS_PER_THREAD` static constexpr | `int` | Items per thread (per tile of input). |
+| `STORE_WARP_TIME_SLICING` static constexpr | `bool` | Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage). |
+| `LOAD_ALGORITHM` static constexpr | `BlockLoadAlgorithm` | The [BlockLoad](/library/api/cub::_block_load) algorithm to use. |
+| `LOAD_MODIFIER` static constexpr | `CacheLoadModifier` | Cache load modifier for reading input elements. |
+| `SCAN_ALGORITHM` static constexpr | `BlockScanAlgorithm` | The [BlockScan](/library/api/cub::_block_scan) algorithm to use. |
diff --git a/fern/cudapages/cub/cub/cub/AgentScanByKeyPolicy.mdx b/fern/cudapages/cub/cub/cub/AgentScanByKeyPolicy.mdx
new file mode 100644
index 0000000..a23df29
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/AgentScanByKeyPolicy.mdx
@@ -0,0 +1,47 @@
+---
+title: cub::AgentScanByKeyPolicy
+description: "Parameterizable tuning policy type for AgentScanByKey."
+---
+
+Parameterizable tuning policy type for AgentScanByKey.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Implementation detail, do not specify directly, requirements on the content of this type are subject to breaking change.
+
+
+
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | |
+| `ITEMS_PER_THREAD` static constexpr | `int` | |
+| `LOAD_ALGORITHM` static constexpr | `BlockLoadAlgorithm` | |
+| `LOAD_MODIFIER` static constexpr | `CacheLoadModifier` | |
+| `SCAN_ALGORITHM` static constexpr | `BlockScanAlgorithm` | |
+| `STORE_ALGORITHM` static constexpr | `BlockStoreAlgorithm` | |
diff --git a/fern/cudapages/cub/cub/cub/AgentScanPolicy.mdx b/fern/cudapages/cub/cub/cub/AgentScanPolicy.mdx
new file mode 100644
index 0000000..6d049df
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/AgentScanPolicy.mdx
@@ -0,0 +1,60 @@
+---
+title: cub::AgentScanPolicy
+description: "Parameterizable tuning policy type for AgentScan."
+---
+
+Parameterizable tuning policy type for AgentScan.
+
+
+
+
+
+Threads per thread block
+
+
+
+Items per thread (per tile of input)
+
+
+
+Dominant compute type
+
+
+
+The [BlockLoad](/library/api/cub::_block_load) algorithm to use
+
+
+
+Cache load modifier for reading input elements
+
+
+
+The [BlockStore](/library/api/cub::_block_store) algorithm to use
+
+
+
+The [BlockScan](/library/api/cub::_block_scan) algorithm to use
+
+
+
+
+
+
+Implementation detail, do not specify directly, requirements on the content of this type are subject to breaking change.
+
+
+
+
+
+**Inherits from:** `detail::MemBoundScaling< NominalBlockThreads4B, NominalItemsPerThread4B, ComputeT >` (public)
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `LOAD_ALGORITHM` static constexpr | `BlockLoadAlgorithm` | |
+| `LOAD_MODIFIER` static constexpr | `CacheLoadModifier` | |
+| `STORE_ALGORITHM` static constexpr | `BlockStoreAlgorithm` | |
+| `SCAN_ALGORITHM` static constexpr | `BlockScanAlgorithm` | |
diff --git a/fern/cudapages/cub/cub/cub/AgentSelectIfPolicy.mdx b/fern/cudapages/cub/cub/cub/AgentSelectIfPolicy.mdx
new file mode 100644
index 0000000..7d7e5ca
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/AgentSelectIfPolicy.mdx
@@ -0,0 +1,48 @@
+---
+title: cub::AgentSelectIfPolicy
+description: "Parameterizable tuning policy type for AgentSelectIf."
+---
+
+Parameterizable tuning policy type for AgentSelectIf.
+
+
+
+
+
+Threads per thread block
+
+
+
+Items per thread (per tile of input)
+
+
+
+The [BlockLoad](/library/api/cub::_block_load) algorithm to use
+
+
+
+Cache load modifier for reading input elements
+
+
+
+The [BlockScan](/library/api/cub::_block_scan) algorithm to use
+
+
+
+Implementation detail, do not specify directly, requirements on the content of this type are subject to breaking change.
+
+
+
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | Threads per thread block. |
+| `ITEMS_PER_THREAD` static constexpr | `int` | Items per thread (per tile of input). |
+| `LOAD_ALGORITHM` static constexpr | `BlockLoadAlgorithm` | The [BlockLoad](/library/api/cub::_block_load) algorithm to use. |
+| `LOAD_MODIFIER` static constexpr | `CacheLoadModifier` | Cache load modifier for reading input elements. |
+| `SCAN_ALGORITHM` static constexpr | `BlockScanAlgorithm` | The [BlockScan](/library/api/cub::_block_scan) algorithm to use. |
diff --git a/fern/cudapages/cub/cub/cub/AgentSubWarpMergeSortPolicy.mdx b/fern/cudapages/cub/cub/cub/AgentSubWarpMergeSortPolicy.mdx
new file mode 100644
index 0000000..651fba1
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/AgentSubWarpMergeSortPolicy.mdx
@@ -0,0 +1,43 @@
+---
+title: cub::AgentSubWarpMergeSortPolicy
+description: ""
+---
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | |
+| `WARP_THREADS` static constexpr | `int` | |
+| `ITEMS_PER_THREAD` static constexpr | `int` | |
+| `ITEMS_PER_TILE` static constexpr | `int` | |
+| `SEGMENTS_PER_BLOCK` static constexpr | `int` | |
+| `LOAD_ALGORITHM` static constexpr | `cub::WarpLoadAlgorithm` | |
+| `LOAD_MODIFIER` static constexpr | `cub::CacheLoadModifier` | |
+| `STORE_ALGORITHM` static constexpr | `cub::WarpStoreAlgorithm` | |
diff --git a/fern/cudapages/cub/cub/cub/AgentThreeWayPartitionPolicy.mdx b/fern/cudapages/cub/cub/cub/AgentThreeWayPartitionPolicy.mdx
new file mode 100644
index 0000000..385a268
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/AgentThreeWayPartitionPolicy.mdx
@@ -0,0 +1,40 @@
+---
+title: cub::AgentThreeWayPartitionPolicy
+description: ""
+---
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | |
+| `ITEMS_PER_THREAD` static constexpr | `int` | |
+| `LOAD_ALGORITHM` static constexpr | `BlockLoadAlgorithm` | |
+| `LOAD_MODIFIER` static constexpr | `CacheLoadModifier` | |
+| `SCAN_ALGORITHM` static constexpr | `BlockScanAlgorithm` | |
diff --git a/fern/cudapages/cub/cub/cub/AgentUniqueByKeyPolicy.mdx b/fern/cudapages/cub/cub/cub/AgentUniqueByKeyPolicy.mdx
new file mode 100644
index 0000000..07baee4
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/AgentUniqueByKeyPolicy.mdx
@@ -0,0 +1,43 @@
+---
+title: cub::AgentUniqueByKeyPolicy
+description: "Parameterizable tuning policy type for AgentUniqueByKey."
+---
+
+Parameterizable tuning policy type for AgentUniqueByKey.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Implementation detail, do not specify directly, requirements on the content of this type are subject to breaking change.
+
+
+
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | |
+| `ITEMS_PER_THREAD` static constexpr | `int` | |
+| `LOAD_ALGORITHM` static constexpr | `cub::BlockLoadAlgorithm` | |
+| `LOAD_MODIFIER` static constexpr | `cub::CacheLoadModifier` | |
+| `SCAN_ALGORITHM` static constexpr | `cub::BlockScanAlgorithm` | |
diff --git a/fern/cudapages/cub/cub/cub/AgentWarpReducePolicy.mdx b/fern/cudapages/cub/cub/cub/AgentWarpReducePolicy.mdx
new file mode 100644
index 0000000..3154401
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/AgentWarpReducePolicy.mdx
@@ -0,0 +1,50 @@
+---
+title: cub::AgentWarpReducePolicy
+description: "Parameterizable tuning policy type for AgentReduce."
+---
+
+Parameterizable tuning policy type for AgentReduce.
+
+
+
+
+
+Threads per thread block
+
+
+
+Threads per warp
+
+
+
+Items per thread (per tile of input)
+
+
+
+Dominant compute type
+
+
+
+Number of items per vectorized load
+
+
+
+Cache load modifier for reading input elements
+
+
+
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `WARP_THREADS` static constexpr | `int` | Number of threads per warp. |
+| `VECTOR_LOAD_LENGTH` static constexpr | `int` | Number of items per vectorized load. |
+| `BLOCK_THREADS` static constexpr | `int` | Number of threads per block. |
+| `ITEMS_PER_THREAD` static constexpr | `int` | Number of items per thread. |
+| `LOAD_MODIFIER` static constexpr | `CacheLoadModifier` | Cache load modifier for reading input elements. |
+| `ITEMS_PER_TILE` static constexpr | `int` | Number of items per tile. |
+| `SEGMENTS_PER_BLOCK` static constexpr | `int` | Number of segments per block. |
diff --git a/fern/cudapages/cub/cub/cub/ArgIndexInputIterator.mdx b/fern/cudapages/cub/cub/cub/ArgIndexInputIterator.mdx
new file mode 100644
index 0000000..b2812f1
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/ArgIndexInputIterator.mdx
@@ -0,0 +1,290 @@
+---
+title: cub::ArgIndexInputIterator
+description: "A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming `KeyValuePair` tuples)."
+---
+
+A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming `KeyValuePair` tuples).
+
+**Overview**
+
+- `ArgIndexInputIterator` wraps a random access input iterator `itr` of type `InputIteratorT`. Dereferencing an `ArgIndexInputIterator` at offset `i` produces a `KeyValuePair` value whose `key` field is `i` and whose `value` field is [`itr`](/library/api/cub::_arg_index_input_iterator::itr)`[i]`.
+- Can be used with any data type.
+- Can be constructed, manipulated, and exchanged within and between host and device functions. Wrapped host memory can only be dereferenced on the host, and wrapped device memory can only be dereferenced on the device.
+- Compatible with Thrust API v1.7 or newer.
+
+**Snippet**
+
+The code snippet below illustrates the use of `ArgIndexInputIterator` to dereference an array of doubles
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/iterator/arg_index_input_iterator.cuh>
+
+// Declare, allocate, and initialize a device array
+double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
+
+// Create an iterator wrapper
+cub::ArgIndexInputIterator<double*> itr(d_in);
+
+// Within device code:
+cub::ArgIndexInputIterator<double*>::value_type tup = *itr;
+printf("%f @ %ld\n",
+ tup.value,
+ tup.key); // 8.0 @ 0
+
+itr = itr + 6;
+tup = *itr;
+printf("%f @ %ld\n",
+ tup.value,
+ tup.key); // 9.0 @ 6
+```
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+
+The value type of the wrapped input iterator
+
+
+
+The difference type of this iterator (Default: `ptrdiff_t`)
+
+
+
+The paired value type of the `<offset,value>` tuple (Default: value type of input iterator)
+
+
+
+
+
+---
+
+## Constructors
+
+### ArgIndexInputIterator inline
+
+
+```cpp showLineNumbers={false}
+cub::ArgIndexInputIterator::ArgIndexInputIterator(
+ InputIteratorT itr,
+ difference_type offset = 0
+)
+```
+
+
+**Parameters**
+
+
+Input iterator to wrap
+
+
+
+Offset (in items) from `itr` denoting the position of the iterator
+
+
+---
+
+## Methods
+
+### operator++ inline
+
+
+
+
+Postfix increment.
+
+
+```cpp showLineNumbers={false}
+self_type cub::ArgIndexInputIterator::operator++(
+ int
+)
+```
+
+
+
+
+
+Prefix increment.
+
+
+```cpp showLineNumbers={false}
+self_type cub::ArgIndexInputIterator::operator++()
+```
+
+
+
+
+
+### operator* inline const
+
+Indirection.
+
+
+```cpp showLineNumbers={false}
+reference cub::ArgIndexInputIterator::operator*() const
+```
+
+
+### operator+ inline const
+
+Addition.
+
+
+```cpp showLineNumbers={false}
+template <typename Distance>
+self_type cub::ArgIndexInputIterator::operator+(
+ Distance n
+) const
+```
+
+
+### operator+= inline
+
+Addition assignment.
+
+
+```cpp showLineNumbers={false}
+template <typename Distance>
+self_type & cub::ArgIndexInputIterator::operator+=(
+ Distance n
+)
+```
+
+
+### operator- inline const
+
+
+
+
+Subtraction.
+
+
+```cpp showLineNumbers={false}
+template <typename Distance>
+self_type cub::ArgIndexInputIterator::operator-(
+ Distance n
+) const
+```
+
+
+
+
+
+Distance.
+
+
+```cpp showLineNumbers={false}
+difference_type cub::ArgIndexInputIterator::operator-(
+ self_type other
+) const
+```
+
+
+
+
+
+### operator-= inline
+
+Subtraction assignment.
+
+
+```cpp showLineNumbers={false}
+template <typename Distance>
+self_type & cub::ArgIndexInputIterator::operator-=(
+ Distance n
+)
+```
+
+
+### operator[] inline const
+
+Array subscript.
+
+
+```cpp showLineNumbers={false}
+template <typename Distance>
+reference cub::ArgIndexInputIterator::operator[](
+ Distance n
+) const
+```
+
+
+### operator-> inline
+
+Structure dereference.
+
+
+```cpp showLineNumbers={false}
+pointer cub::ArgIndexInputIterator::operator->()
+```
+
+
+### operator== inline
+
+Equal to.
+
+
+```cpp showLineNumbers={false}
+bool cub::ArgIndexInputIterator::operator==(
+ const self_type &rhs
+)
+```
+
+
+### operator!= inline
+
+Not equal to.
+
+
+```cpp showLineNumbers={false}
+bool cub::ArgIndexInputIterator::operator!=(
+ const self_type &rhs
+)
+```
+
+
+### normalize inline
+
+Normalize.
+
+
+```cpp showLineNumbers={false}
+void cub::ArgIndexInputIterator::normalize()
+```
+
+
+### operator<< inline
+
+
+```cpp showLineNumbers={false}
+friend ::std::ostream & cub::ArgIndexInputIterator::operator<<(
+ ::std::ostream &os,
+ const self_type &
+)
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `self_type` | `ArgIndexInputIterator` | My own type. |
+| `difference_type` | `OffsetT` | Type to express the result of subtracting one iterator from another. |
+| `value_type` | `KeyValuePair< difference_type, OutputValueT >` | The type of the element the iterator can point to. |
+| `pointer` | `value_type *` | The type of a pointer to an element the iterator can point to. |
+| `reference` | `value_type` | The type of a reference to an element the iterator can point to. |
+| `iterator_category` | `THRUST_NS_QUALIFIER::detail::iterator_facade_category_t< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag >` | The iterator category. |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `itr` | `InputIteratorT` | |
+| `offset` | `difference_type` | |
diff --git a/fern/cudapages/cub/cub/cub/ArgMax.mdx b/fern/cudapages/cub/cub/cub/ArgMax.mdx
new file mode 100644
index 0000000..0aadeef
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/ArgMax.mdx
@@ -0,0 +1,24 @@
+---
+title: cub::ArgMax
+description: "Arg max functor (keeps the value and offset of the first occurrence of the larger item)."
+---
+
+Arg max functor (keeps the value and offset of the first occurrence of the larger item).
+
+---
+
+## Methods
+
+### operator() inline const
+
+Boolean max operator, preferring the item having the smaller offset in case of ties.
+
+
+```cpp showLineNumbers={false}
+template <typename T, typename OffsetT>
+KeyValuePair<OffsetT, T> cub::ArgMax::operator()(
+ const KeyValuePair<OffsetT, T> &a,
+ const KeyValuePair<OffsetT, T> &b
+) const
+```
+
diff --git a/fern/cudapages/cub/cub/cub/ArgMin.mdx b/fern/cudapages/cub/cub/cub/ArgMin.mdx
new file mode 100644
index 0000000..9b17384
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/ArgMin.mdx
@@ -0,0 +1,24 @@
+---
+title: cub::ArgMin
+description: "Arg min functor (keeps the value and offset of the first occurrence of the smallest item)."
+---
+
+Arg min functor (keeps the value and offset of the first occurrence of the smallest item).
+
+---
+
+## Methods
+
+### operator() inline const
+
+Boolean min operator, preferring the item having the smaller offset in case of ties.
+
+
+```cpp showLineNumbers={false}
+template <typename T, typename OffsetT>
+KeyValuePair<OffsetT, T> cub::ArgMin::operator()(
+ const KeyValuePair<OffsetT, T> &a,
+ const KeyValuePair<OffsetT, T> &b
+) const
+```
+
diff --git a/fern/cudapages/cub/cub/cub/BFEDigitExtractor.mdx b/fern/cudapages/cub/cub/cub/BFEDigitExtractor.mdx
new file mode 100644
index 0000000..5d7be9b
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BFEDigitExtractor.mdx
@@ -0,0 +1,82 @@
+---
+title: cub::BFEDigitExtractor
+description: "A wrapper type to extract digits."
+---
+
+A wrapper type to extract digits.
+
+Uses the BFE intrinsic to extract a digit from a key.
+
+
+
+
+
+
+
+
+
+
+**Inherits from:** `cub::BaseDigitExtractor< KeyT >` (public)
+
+---
+
+## Constructors
+
+### BFEDigitExtractor inline explicit
+
+
+```cpp showLineNumbers={false}
+cub::BFEDigitExtractor::BFEDigitExtractor(
+ ::cuda::std::uint32_t bit_start = 0,
+ ::cuda::std::uint32_t num_bits = 0
+)
+```
+
+
+---
+
+## Methods
+
+### Digit inline const
+
+
+```cpp showLineNumbers={false}
+::cuda::std::uint32_t cub::BFEDigitExtractor::Digit(
+ UnsignedBits key
+) const
+```
+
+
+---
+
+## Static methods
+
+### ProcessFloatMinusZero inline static
+
+
+```cpp showLineNumbers={false}
+static UnsignedBits cub::BaseDigitExtractor::ProcessFloatMinusZero(
+ UnsignedBits key
+)
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `TraitsT` | `Traits< KeyT >` |
+| `UnsignedBits` | `typename TraitsT::UnsignedBits` |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `bit_start` | `::cuda::std::uint32_t` | |
+| `num_bits` | `::cuda::std::uint32_t` | |
diff --git a/fern/cudapages/cub/cub/cub/BaseDigitExtractor.mdx b/fern/cudapages/cub/cub/cub/BaseDigitExtractor.mdx
new file mode 100644
index 0000000..22db328
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BaseDigitExtractor.mdx
@@ -0,0 +1,49 @@
+---
+title: cub::BaseDigitExtractor
+description: "Base struct for digit extractor."
+---
+
+Base struct for digit extractor.
+
+Contains common code to provide special handling for floating-point -0.0.
+
+
+This handles correctly both the case when the keys are bitwise-complemented after twiddling for descending sort (in onesweep) as well as when the keys are not bit-negated, but the implementation handles descending sort separately (in other implementations in CUB). Twiddling alone maps -0.0f to 0x7fffffff and +0.0f to 0x80000000 for float, which are subsequent bit patterns and bitwise complements of each other. For onesweep, both -0.0f and +0.0f are mapped to the bit pattern of +0.0f (0x80000000) for ascending sort, and to the pattern of -0.0f (0x7fffffff) for descending sort. For all other sorting implementations in CUB, both are always mapped to +0.0f. Since bit patterns for both -0.0f and +0.0f are next to each other and only one of them is used, the sorting works correctly. For double, the same applies, but with 64-bit patterns.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Static methods
+
+### ProcessFloatMinusZero inline static
+
+
+```cpp showLineNumbers={false}
+static UnsignedBits cub::BaseDigitExtractor::ProcessFloatMinusZero(
+ UnsignedBits key
+)
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `TraitsT` | `Traits< KeyT >` |
+| `UnsignedBits` | `typename TraitsT::UnsignedBits` |
diff --git a/fern/cudapages/cub/cub/cub/BaseDigitExtractor_KeyT_true.mdx b/fern/cudapages/cub/cub/cub/BaseDigitExtractor_KeyT_true.mdx
new file mode 100644
index 0000000..86098fe
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BaseDigitExtractor_KeyT_true.mdx
@@ -0,0 +1,38 @@
+---
+title: "cub::BaseDigitExtractor< KeyT, true >"
+description: ""
+---
+
+
+
+
+
+
+
+
+
+
+---
+
+## Static methods
+
+### ProcessFloatMinusZero inline static
+
+
+```cpp showLineNumbers={false}
+static UnsignedBits cub::BaseDigitExtractor::ProcessFloatMinusZero(
+ UnsignedBits key
+)
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `TraitsT` | `Traits< KeyT >` |
+| `UnsignedBits` | `typename TraitsT::UnsignedBits` |
diff --git a/fern/cudapages/cub/cub/cub/BlockAdjacentDifference.mdx b/fern/cudapages/cub/cub/cub/BlockAdjacentDifference.mdx
new file mode 100644
index 0000000..c3586f8
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockAdjacentDifference.mdx
@@ -0,0 +1,762 @@
+---
+title: cub::BlockAdjacentDifference
+description: ""
+---
+
+BlockAdjacentDifference provides collective methods for computing the differences of adjacent elements partitioned across a CUDA thread block.
+
+## Example
+
+The code snippet below illustrates how to use BlockAdjacentDifference to compute the left difference between adjacent elements.
+
+Suppose the set of input `thread_data` across the block of threads is `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`. The corresponding output `result` in those threads will be `{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/block/block_adjacent_difference.cuh>
+
+struct CustomDifference
+{
+ template <typename DataType>
+ __host__ __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ {
+ return lhs - rhs;
+ }
+};
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockAdjacentDifference for a 1D block of
+ // 128 threads of type int
+ using BlockAdjacentDifferenceT =
+ cub::BlockAdjacentDifference<int, 128>;
+
+ // Allocate shared memory for BlockAdjacentDifference
+ __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Collectively compute adjacent_difference
+ int result[4];
+
+ BlockAdjacentDifferenceT(temp_storage).SubtractLeft(thread_data, result,
+ CustomDifference());
+}
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Collective constructors
+
+### BlockAdjacentDifference inline
+
+
+
+
+Collective constructor using a private static allocation of shared memory as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockAdjacentDifference::BlockAdjacentDifference()
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+Collective constructor using the specified memory allocation as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockAdjacentDifference::BlockAdjacentDifference(
+ TempStorage &temp_storage
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Parameters**
+
+
+Reference to memory allocation having layout type [TempStorage](/library/api/cub::BlockAdjacentDifference::TempStorage)
+
+
+
+
+
+---
+
+## Read left operations
+
+### SubtractLeft inline
+
+
+
+
+Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockAdjacentDifference::SubtractLeft(
+ T (&input)[ITEMS_PER_THREAD],
+ OutputType (&output)[ITEMS_PER_THREAD],
+ DifferenceOpT difference_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Assumes threads are in row-major order.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Calling thread's input items (may be aliased to `output`)
+
+
+
+Calling thread's adjacent difference result
+
+
+
+Binary difference operator
+
+
+**Example**
+
+The code snippet below illustrates how to use BlockAdjacentDifference to compute the left difference between adjacent elements.
+
+Suppose the set of input `thread_data` across the block of threads is `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`. The corresponding output `result` in those threads will be `{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/block/block_adjacent_difference.cuh>
+
+struct CustomDifference
+{
+ template <typename DataType>
+ __host__ __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ {
+ return lhs - rhs;
+ }
+};
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockAdjacentDifference for a 1D block
+ // of 128 threads of type int
+ using BlockAdjacentDifferenceT =
+ cub::BlockAdjacentDifference<int, 128>;
+
+ // Allocate shared memory for BlockAdjacentDifference
+ __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Collectively compute adjacent_difference
+ BlockAdjacentDifferenceT(temp_storage).SubtractLeft(thread_data, thread_data,
+ CustomDifference());
+}
+```
+
+
+
+
+Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockAdjacentDifference::SubtractLeft(
+ T (&input)[ITEMS_PER_THREAD],
+ OutputT (&output)[ITEMS_PER_THREAD],
+ DifferenceOpT difference_op,
+ T tile_predecessor_item
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Assumes threads are in row-major order.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Calling thread's input items (may be aliased to `output`)
+
+
+
+Calling thread's adjacent difference result
+
+
+
+Binary difference operator
+
+
+
+*thread*<sub>0</sub> only item which is going to be subtracted from the first tile item (*input*<sub>0</sub> from *thread*<sub>0</sub>).
+
+
+**Example**
+
+The code snippet below illustrates how to use BlockAdjacentDifference to compute the left difference between adjacent elements.
+
+Suppose the set of input `thread_data` across the block of threads is `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`, and that `tile_predecessor_item` is `3`. The corresponding output `result` in those threads will be `{ [1,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/block/block_adjacent_difference.cuh>
+
+struct CustomDifference
+{
+ template <typename DataType>
+ __host__ __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ {
+ return lhs - rhs;
+ }
+};
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockAdjacentDifference for a 1D block of
+ // 128 threads of type int
+ using BlockAdjacentDifferenceT =
+ cub::BlockAdjacentDifference<int, 128>;
+
+ // Allocate shared memory for BlockAdjacentDifference
+ __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // The last item in the previous tile:
+ int tile_predecessor_item = ...;
+
+ // Collectively compute adjacent_difference
+ BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
+ thread_data,
+ thread_data,
+ CustomDifference(),
+ tile_predecessor_item);
+```
+
+
+
+
+### SubtractLeftPartialTile inline
+
+
+
+
+Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockAdjacentDifference::SubtractLeftPartialTile(
+ T (&input)[ITEMS_PER_THREAD],
+ OutputType (&output)[ITEMS_PER_THREAD],
+ DifferenceOpT difference_op,
+ int valid_items
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Assumes threads are in row-major order.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Calling thread's input items (may be aliased to `output`)
+
+
+
+Calling thread's adjacent difference result
+
+
+
+Binary difference operator
+
+
+
+Number of valid items in thread block
+
+
+**Example**
+
+The code snippet below illustrates how to use BlockAdjacentDifference to compute the left difference between adjacent elements.
+
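+Suppose the set of input `thread_data` across the block of threads is `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`, and that `valid_items` is `9`. The corresponding output `result` in those threads will be `{ [4,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`.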
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/block/block_adjacent_difference.cuh>
+
+struct CustomDifference
+{
+ template <typename DataType>
+ __host__ __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ {
+ return lhs - rhs;
+ }
+};
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockAdjacentDifference for a 1D block of
+ // 128 threads of type int
+ using BlockAdjacentDifferenceT =
+ cub::BlockAdjacentDifference<int, 128>;
+
+ // Allocate shared memory for BlockAdjacentDifference
+ __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+ int valid_items = 9;
+
+ // Collectively compute adjacent_difference
+ BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
+ thread_data,
+ thread_data,
+ CustomDifference(),
+ valid_items);
+```
+
+
+
+
+Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockAdjacentDifference::SubtractLeftPartialTile(
+ T (&input)[ITEMS_PER_THREAD],
+ OutputType (&output)[ITEMS_PER_THREAD],
+ DifferenceOpT difference_op,
+ int valid_items,
+ T tile_predecessor_item
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Assumes threads are in row-major order.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Calling thread's input items (may be aliased to `output`)
+
+
+
+Calling thread's adjacent difference result
+
+
+
+Binary difference operator
+
+
+
+Number of valid items in thread block
+
+
+
+*thread*<sub>0</sub> only item which is going to be subtracted from the first tile item (*input*<sub>0</sub> from *thread*<sub>0</sub>).
+
+
+**Example**
+
+The code snippet below illustrates how to use BlockAdjacentDifference to compute the left difference between adjacent elements.
+
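+Suppose the set of input `thread_data` across the block of threads is `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`, that `valid_items` is `9`, and that `tile_predecessor_item` is `4`. The corresponding output `result` in those threads will be `{ [0,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`.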
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/block/block_adjacent_difference.cuh>
+
+struct CustomDifference
+{
+ template <typename DataType>
+ __host__ __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ {
+ return lhs - rhs;
+ }
+};
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockAdjacentDifference for a 1D block of
+ // 128 threads of type int
+ using BlockAdjacentDifferenceT =
+ cub::BlockAdjacentDifference<int, 128>;
+
+ // Allocate shared memory for BlockAdjacentDifference
+ __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+ int valid_items = 9;
+ int tile_predecessor_item = 4;
+
+ // Collectively compute adjacent_difference
+ BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
+ thread_data,
+ thread_data,
+ CustomDifference(),
+ valid_items,
+ tile_predecessor_item);
+```
+
+
+
+
+---
+
+## Read right operations
+
+### SubtractRight inline
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockAdjacentDifference::SubtractRight(
+ T (&input)[ITEMS_PER_THREAD],
+ OutputT (&output)[ITEMS_PER_THREAD],
+ DifferenceOpT difference_op
+)
+```
+
+
+
+
+
+Subtracts the right element of each adjacent pair of elements partitioned across a CUDA thread block.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockAdjacentDifference::SubtractRight(
+ T (&input)[ITEMS_PER_THREAD],
+ OutputT (&output)[ITEMS_PER_THREAD],
+ DifferenceOpT difference_op,
+ T tile_successor_item
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Assumes threads are in row-major order.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Calling thread's input items (may be aliased to `output`)
+
+
+
+Calling thread's adjacent difference result
+
+
+
+Binary difference operator
+
+
+
+*thread*<sub>BLOCK_THREADS</sub> only item which is going to be subtracted from the last tile item (*input*<sub>ITEMS_PER_THREAD</sub> from *thread*<sub>BLOCK_THREADS</sub>).
+
+
+**Example**
+
+The code snippet below illustrates how to use BlockAdjacentDifference to compute the right difference between adjacent elements.
+
+Suppose the set of input `thread_data` across the block of threads is `{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }`, and that `tile_successor_item` is `3`. The corresponding output `result` in those threads will be `{ ...-1, [2,1,0,0], [0,0,0,-1], [-1,0,0,0], [-1,3,-3,1] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/block/block_adjacent_difference.cuh>
+
+struct CustomDifference
+{
+ template <typename DataType>
+ __host__ __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ {
+ return lhs - rhs;
+ }
+};
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockAdjacentDifference for a 1D block of
+ // 128 threads of type int
+ using BlockAdjacentDifferenceT =
+ cub::BlockAdjacentDifference<int, 128>;
+
+ // Allocate shared memory for BlockAdjacentDifference
+ __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // The first item in the next tile:
+ int tile_successor_item = ...;
+
+ // Collectively compute adjacent_difference
+ BlockAdjacentDifferenceT(temp_storage).SubtractRight(
+ thread_data,
+ thread_data,
+ CustomDifference(),
+ tile_successor_item);
+```
+
+
+
+
+### SubtractRightPartialTile inline
+
+Subtracts the right element of each adjacent pair in range of elements partitioned across a CUDA thread block.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockAdjacentDifference::SubtractRightPartialTile(
+ T (&input)[ITEMS_PER_THREAD],
+ OutputT (&output)[ITEMS_PER_THREAD],
+ DifferenceOpT difference_op,
+ int valid_items
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Assumes threads are in row-major order.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Calling thread's input items (may be aliased to `output`)
+
+
+
+Calling thread's adjacent difference result
+
+
+
+Binary difference operator
+
+
+
+Number of valid items in thread block
+
+
+**Example**
+
+The code snippet below illustrates how to use BlockAdjacentDifference to compute the right difference between adjacent elements.
+
+Suppose the set of input `thread_data` across the block of threads is `{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }`, and that `valid_items` is `507`. The corresponding output `result` in those threads will be `{ ...-1, [2,1,0,0], [0,0,0,-1], [-1,0,3,3], [3,4,1,4] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/block/block_adjacent_difference.cuh>
+
+struct CustomDifference
+{
+ template <typename DataType>
+ __host__ __device__ DataType operator()(DataType &lhs, DataType &rhs)
+ {
+ return lhs - rhs;
+ }
+};
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockAdjacentDifference for a 1D block of
+ // 128 threads of type int
+ using BlockAdjacentDifferenceT =
+ cub::BlockAdjacentDifference<int, 128>;
+
+ // Allocate shared memory for BlockAdjacentDifference
+ __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Collectively compute adjacent_difference
+ BlockAdjacentDifferenceT(temp_storage).SubtractRightPartialTile(
+ thread_data,
+ thread_data,
+ CustomDifference(),
+ valid_items);
+```
+
+---
+
+## Utility methods
+
+### PrivateStorage inline
+
+Internal storage allocator.
+
+
+```cpp showLineNumbers={false}
+_TempStorage & cub::BlockAdjacentDifference::PrivateStorage()
+```
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | The thread block size in threads. |
+| `temp_storage` | `_TempStorage &` | Shared storage reference. |
+| `linear_tid` | `unsigned int` | Linear thread-id. |
+
+---
+
+## Inner classes
+
+### _TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockAdjacentDifference::_TempStorage
+```
+
+
+Shared memory storage layout type (last element from each thread's input).
+
+| Name | Type | Description |
+|---|---|---|
+| `first_items` | `T` | |
+| `last_items` | `T` | |
+
+### ApplyOp
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockAdjacentDifference::ApplyOp
+```
+
+
+Specialization for when FlagOp has third index param.
+
+### ApplyOp< FlagOp, false >
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockAdjacentDifference::ApplyOp< FlagOp, false >
+```
+
+
+Specialization for when FlagOp does not have a third index param.
+
+### Iterate
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockAdjacentDifference::Iterate
+```
+
+
+Templated unrolling of item comparison (inductive case).
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockAdjacentDifference::TempStorage
+```
+
+
+The operations exposed by `BlockAdjacentDifference` require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the `__shared__` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or `union`'d with other storage allocation types to facilitate memory reuse.
+
+**Inherits from:** `Uninitialized< _TempStorage >` (public)
diff --git a/fern/cudapages/cub/cub/cub/BlockDiscontinuity.mdx b/fern/cudapages/cub/cub/cub/BlockDiscontinuity.mdx
new file mode 100644
index 0000000..f34d82a
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockDiscontinuity.mdx
@@ -0,0 +1,1003 @@
+---
+title: cub::BlockDiscontinuity
+description: ""
+---
+
+The BlockDiscontinuity class provides collective methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
+
+## Performance considerations
+
+- Performance is sensitive to the degree of data movement across the block.
+- Incurs zero bank conflicts for most types
+
+## Example
+
+The code snippet below illustrates the head flagging of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
+
+Suppose the set of input `thread_data` across the block of threads is `{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }`. The corresponding output `head_flags` in those threads will be `{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ using BlockDiscontinuity = cub::BlockDiscontinuity<int, 128>;
+
+ // Allocate shared memory for BlockDiscontinuity
+ __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Collectively compute head flags for discontinuities in the segment
+ int head_flags[4];
+ BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+}
+```
+
+
+
+
+
+The data type to be flagged.
+
+
+
+The thread block length in threads along the X dimension
+
+
+
+**[optional]** The thread block length in threads along the Y dimension (default: 1)
+
+
+
+**[optional]** The thread block length in threads along the Z dimension (default: 1)
+
+
+
+
+
+---
+
+## Collective constructors
+
+### BlockDiscontinuity inline
+
+
+
+
+Collective constructor using a private static allocation of shared memory as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockDiscontinuity::BlockDiscontinuity()
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+Collective constructor using the specified memory allocation as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockDiscontinuity::BlockDiscontinuity(
+ TempStorage &temp_storage
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Parameters**
+
+
+Reference to memory allocation having layout type [TempStorage](/library/api/cub::BlockDiscontinuity::TempStorage)
+
+
+
+
+
+---
+
+## Head flag operations
+
+### FlagHeads inline
+
+
+
+
+Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockDiscontinuity::FlagHeads(
+ FlagT (&head_flags)[ITEMS_PER_THREAD],
+ T (&input)[ITEMS_PER_THREAD],
+ FlagOp flag_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The flag `head_flags[i]` is set for item `input[i]` when `flag_op(previous-item, input[i])` returns `true` (where `previous-item` is either the preceding item in the same thread or the last item in the previous thread).
+For *thread*<sub>0</sub>, item `input[0]` is always flagged.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread
+
+
+
+**[inferred]** The flag type (must be an integer type)
+
+
+
+**[inferred]** Binary predicate functor type having member `T operator()(const T &a, const T &b)` or member `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true` if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the rank of b in the aggregate tile of data.
+
+
+**Parameters**
+
+
+Calling thread's discontinuity head_flags
+
+
+
+Calling thread's input items
+
+
+
+Binary boolean flag predicate
+
+
+**Example**
+
+The code snippet below illustrates the head-flagging of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
+
+Suppose the set of input `thread_data` across the block of threads is `{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }`. The corresponding output `head_flags` in those threads will be `{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ using BlockDiscontinuity = cub::BlockDiscontinuity<int, 128>;
+
+ // Allocate shared memory for BlockDiscontinuity
+ __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Collectively compute head flags for discontinuities in the segment
+ int head_flags[4];
+ BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+}
+```
+
+
+
+
+Sets head flags indicating discontinuities between items partitioned across the thread block.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockDiscontinuity::FlagHeads(
+ FlagT (&head_flags)[ITEMS_PER_THREAD],
+ T (&input)[ITEMS_PER_THREAD],
+ FlagOp flag_op,
+ T tile_predecessor_item
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The flag `head_flags[i]` is set for item `input[i]` when `flag_op(previous-item, input[i])` returns `true` (where `previous-item` is either the preceding item in the same thread or the last item in the previous thread).
+For *thread*<sub>0</sub>, item `input[0]` is compared against `tile_predecessor_item`.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+
+**[inferred]** The flag type (must be an integer type)
+
+
+
+**[inferred]** Binary predicate functor type having member `T operator()(const T &a, const T &b)` or member `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true` if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the rank of b in the aggregate tile of data.
+
+
+**Parameters**
+
+
+Calling thread's discontinuity `head_flags`
+
+
+
+Calling thread's input items
+
+
+
+Binary boolean flag predicate
+
+
+
+*thread*<sub>0</sub> only item with which to compare the first tile item (`input[0]` from *thread*<sub>0</sub>).
+
+
+**Example**
+
+The code snippet below illustrates the head-flagging of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
+
+Suppose the set of input `thread_data` across the block of threads is `{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }`, and that `tile_predecessor_item` is `0`. The corresponding output `head_flags` in those threads will be `{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ using BlockDiscontinuity = cub::BlockDiscontinuity<int, 128>;
+
+ // Allocate shared memory for BlockDiscontinuity
+ __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Have thread0 obtain the predecessor item for the entire tile
+ int tile_predecessor_item;
+ if (threadIdx.x == 0) tile_predecessor_item = ...
+
+ // Collectively compute head flags for discontinuities in the segment
+ int head_flags[4];
+ BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data,
+ cub::Inequality(), tile_predecessor_item);
+}
+```
+
+
+
+
+---
+
+## Tail flag operations
+
+### FlagTails inline
+
+
+
+
+Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockDiscontinuity::FlagTails(
+ FlagT (&tail_flags)[ITEMS_PER_THREAD],
+ T (&input)[ITEMS_PER_THREAD],
+ FlagOp flag_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The flag `tail_flags[i]` is set for item `input[i]` when `flag_op(input[i], next-item)` returns `true` (where `next-item` is either the next item in the same thread or the first item in the next thread).
+For *thread*<sub>BLOCK_THREADS - 1</sub>, item `input[ITEMS_PER_THREAD - 1]` is always flagged.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+
+**[inferred]** The flag type (must be an integer type)
+
+
+
+**[inferred]** Binary predicate functor type having member `T operator()(const T &a, const T &b)` or member `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true` if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the rank of `b` in the aggregate tile of data.
+
+
+**Parameters**
+
+
+Calling thread's discontinuity tail_flags
+
+
+
+Calling thread's input items
+
+
+
+Binary boolean flag predicate
+
+
+**Example**
+
+The code snippet below illustrates the tail-flagging of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
+
+Suppose the set of input `thread_data` across the block of threads is `{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }`. The corresponding output `tail_flags` in those threads will be `{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ using BlockDiscontinuity = cub::BlockDiscontinuity<int, 128>;
+
+ // Allocate shared memory for BlockDiscontinuity
+ __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Collectively compute tail flags for discontinuities in the segment
+ int tail_flags[4];
+ BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
+}
+```
+
+
+
+
+Sets tail flags indicating discontinuities between items partitioned across the thread block.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockDiscontinuity::FlagTails(
+ FlagT (&tail_flags)[ITEMS_PER_THREAD],
+ T (&input)[ITEMS_PER_THREAD],
+ FlagOp flag_op,
+ T tile_successor_item
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The flag `tail_flags[i]` is set for item `input[i]` when `flag_op(input[i], next-item)` returns `true` (where `next-item` is either the next item in the same thread or the first item in the next thread).
+For *thread*<sub>BLOCK_THREADS - 1</sub>, item `input[ITEMS_PER_THREAD - 1]` is compared against `tile_successor_item`.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+
+**[inferred]** The flag type (must be an integer type)
+
+
+
+**[inferred]** Binary predicate functor type having member `T operator()(const T &a, const T &b)` or member `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true` if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the rank of `b` in the aggregate tile of data.
+
+
+**Parameters**
+
+
+Calling thread's discontinuity tail_flags
+
+
+
+Calling thread's input items
+
+
+
+Binary boolean flag predicate
+
+
+
+*thread*<sub>BLOCK_THREADS - 1</sub> only item with which to compare the last tile item (`input[ITEMS_PER_THREAD - 1]` from *thread*<sub>BLOCK_THREADS - 1</sub>).
+
+
+**Example**
+
+The code snippet below illustrates the tail-flagging of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
+
+Suppose the set of input `thread_data` across the block of threads is `{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }`, and that `tile_successor_item` is `125`. The corresponding output `tail_flags` in those threads will be `{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ using BlockDiscontinuity = cub::BlockDiscontinuity<int, 128>;
+
+ // Allocate shared memory for BlockDiscontinuity
+ __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Have thread127 obtain the successor item for the entire tile
+ int tile_successor_item;
+ if (threadIdx.x == 127) tile_successor_item = ...
+
+ // Collectively compute tail flags for discontinuities in the segment
+ int tail_flags[4];
+ BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data,
+ cub::Inequality(), tile_successor_item);
+}
+```
+
+
+
+
+---
+
+## Head & tail flag operations
+
+### FlagHeadsAndTails inline
+
+
+
+
+Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockDiscontinuity::FlagHeadsAndTails(
+ FlagT (&head_flags)[ITEMS_PER_THREAD],
+ FlagT (&tail_flags)[ITEMS_PER_THREAD],
+ T (&input)[ITEMS_PER_THREAD],
+ FlagOp flag_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The flag `head_flags[i]` is set for item `input[i]` when `flag_op(previous-item, input[i])` returns `true` (where `previous-item` is either the preceding item in the same thread or the last item in the previous thread).
+For *thread*<sub>0</sub>, item `input[0]` is always flagged.
+The flag `tail_flags[i]` is set for item `input[i]` when `flag_op(input[i], next-item)` returns `true` (where `next-item` is either the next item in the same thread or the first item in the next thread).
+For *thread*<sub>BLOCK_THREADS - 1</sub>, item `input[ITEMS_PER_THREAD - 1]` is always flagged.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+
+**[inferred]** The flag type (must be an integer type)
+
+
+
+**[inferred]** Binary predicate functor type having member `T operator()(const T &a, const T &b)` or member `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true` if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the rank of `b` in the aggregate tile of data.
+
+
+**Parameters**
+
+
+Calling thread's discontinuity head_flags
+
+
+
+Calling thread's discontinuity tail_flags
+
+
+
+Calling thread's input items
+
+
+
+Binary boolean flag predicate
+
+
+**Example**
+
+The code snippet below illustrates the head- and tail-flagging of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
+
+Suppose the set of input `thread_data` across the block of threads is `{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }`. The corresponding output `head_flags` in those threads will be `{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }`, and the corresponding output `tail_flags` in those threads will be `{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ using BlockDiscontinuity = cub::BlockDiscontinuity<int, 128>;
+
+ // Allocate shared memory for BlockDiscontinuity
+ __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Collectively compute head and tail flags for discontinuities in the segment
+ int head_flags[4];
+ int tail_flags[4];
+ BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tail_flags, thread_data,
+ cub::Inequality());
+}
+```
+
+
+
+
+Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+void cub::BlockDiscontinuity::FlagHeadsAndTails(
+ FlagT (&head_flags)[ITEMS_PER_THREAD],
+ FlagT (&tail_flags)[ITEMS_PER_THREAD],
+ T tile_successor_item,
+ T (&input)[ITEMS_PER_THREAD],
+ FlagOp flag_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The flag `head_flags[i]` is set for item `input[i]` when `flag_op(previous-item, input[i])` returns `true` (where `previous-item` is either the preceding item in the same thread or the last item in the previous thread).
+For *thread*<sub>0</sub>, item `input[0]` is always flagged.
+The flag `tail_flags[i]` is set for item `input[i]` when `flag_op(input[i], next-item)` returns `true` (where `next-item` is either the next item in the same thread or the first item in the next thread).
+For *thread*<sub>BLOCK_THREADS - 1</sub>, item `input[ITEMS_PER_THREAD - 1]` is compared against `tile_successor_item`.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+
+**[inferred]** The flag type (must be an integer type)
+
+
+
+**[inferred]** Binary predicate functor type having member `T operator()(const T &a, const T &b)` or member `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true` if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the rank of b in the aggregate tile of data.
+
+
+**Parameters**
+
+
+Calling thread's discontinuity head_flags
+
+
+
+Calling thread's discontinuity tail_flags
+
+
+
+*thread*<sub>BLOCK_THREADS - 1</sub> only: the item with which to compare
+the last tile item (`input[ITEMS_PER_THREAD - 1]` from
+*thread*<sub>BLOCK_THREADS - 1</sub>).
+
+
+
+
+
+Calling thread's input items
+
+
+
+Binary boolean flag predicate
+
+
+**Example**
+
+The code snippet below illustrates the head- and tail-flagging of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
+
+Suppose the set of input `thread_data` across the block of threads is `{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }` and that `tile_successor_item` is `125`. The corresponding output `head_flags` in those threads will be `{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }`, and the corresponding output `tail_flags` in those threads will be `{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ using BlockDiscontinuity = cub::BlockDiscontinuity<int, 128>;
+
+ // Allocate shared memory for BlockDiscontinuity
+ __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Have thread127 obtain the successor item for the entire tile
+ int tile_successor_item;
+ if (threadIdx.x == 127) tile_successor_item = ...
+
+ // Collectively compute head and tail flags for discontinuities in the segment
+ int head_flags[4];
+ int tail_flags[4];
+ BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tail_flags,
+ tile_successor_item, thread_data,
+ cub::Inequality());
+}
+```
+
+
+
+
+Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+void cub::BlockDiscontinuity::FlagHeadsAndTails(
+ FlagT (&head_flags)[ITEMS_PER_THREAD],
+ T tile_predecessor_item,
+ FlagT (&tail_flags)[ITEMS_PER_THREAD],
+ T (&input)[ITEMS_PER_THREAD],
+ FlagOp flag_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The flag `head_flags[i]` is set for item `input[i]` when `flag_op(previous-item, input[i])` returns `true` (where `previous-item` is either the preceding item in the same thread or the last item in the previous thread).
+For *thread*<sub>0</sub>, item `input[0]` is compared against `tile_predecessor_item`.
+The flag `tail_flags[i]` is set for item `input[i]` when `flag_op(input[i], next-item)` returns `true` (where `next-item` is either the next item in the same thread or the first item in the next thread).
+For *thread*<sub>BLOCK_THREADS - 1</sub>, item `input[ITEMS_PER_THREAD - 1]` is always flagged.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+
+**[inferred]** The flag type (must be an integer type)
+
+
+
+**[inferred]** Binary predicate functor type having member `T operator()(const T &a, const T &b)` or member `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true` if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the rank of b in the aggregate tile of data.
+
+
+**Parameters**
+
+
+Calling thread's discontinuity head_flags
+
+
+
+*thread*<sub>0</sub> only: the item with which to compare the first tile item (`input[0]` from *thread*<sub>0</sub>).
+
+
+
+
+
+Calling thread's discontinuity tail_flags
+
+
+
+Calling thread's input items
+
+
+
+Binary boolean flag predicate
+
+
+**Example**
+
+The code snippet below illustrates the head- and tail-flagging of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
+
+Suppose the set of input `thread_data` across the block of threads is `{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }`, that the `tile_predecessor_item` is `0`, and that the `tile_successor_item` is `125`. The corresponding output `head_flags` in those threads will be `{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }`, and the corresponding output `tail_flags` in those threads will be `{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ using BlockDiscontinuity = cub::BlockDiscontinuity<int, 128>;
+
+ // Allocate shared memory for BlockDiscontinuity
+ __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Have thread0 obtain the predecessor item for the entire tile
+ int tile_predecessor_item;
+ if (threadIdx.x == 0) tile_predecessor_item = ...
+
+ // Have thread127 obtain the successor item for the entire tile
+ int tile_successor_item;
+ if (threadIdx.x == 127) tile_successor_item = ...
+
+ // Collectively compute head and tail flags for discontinuities in the segment
+ int head_flags[4];
+ int tail_flags[4];
+ BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tile_predecessor_item,
+ tail_flags, tile_successor_item,
+ thread_data, cub::Inequality());
+}
+```
+
+
+
+
+Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+void cub::BlockDiscontinuity::FlagHeadsAndTails(
+ FlagT (&head_flags)[ITEMS_PER_THREAD],
+ T tile_predecessor_item,
+ FlagT (&tail_flags)[ITEMS_PER_THREAD],
+ T tile_successor_item,
+ T (&input)[ITEMS_PER_THREAD],
+ FlagOp flag_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The flag `head_flags[i]` is set for item `input[i]` when `flag_op(previous-item, input[i])` returns `true` (where `previous-item` is either the preceding item in the same thread or the last item in the previous thread).
+For *thread*<sub>0</sub>, item `input[0]` is compared against `tile_predecessor_item`.
+The flag `tail_flags[i]` is set for item `input[i]` when `flag_op(input[i], next-item)` returns `true` (where `next-item` is either the next item in the same thread or the first item in the next thread).
+For *thread*<sub>BLOCK_THREADS - 1</sub>, item `input[ITEMS_PER_THREAD - 1]` is compared against `tile_successor_item`.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+
+**[inferred]** The flag type (must be an integer type)
+
+
+
+**[inferred]** Binary predicate functor type having member `T operator()(const T &a, const T &b)` or member `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true` if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the rank of `b` in the aggregate tile of data.
+
+
+**Parameters**
+
+
+Calling thread's discontinuity head_flags
+
+
+
+*thread*<sub>0</sub> only: the item with which to compare the first tile item (`input[0]` from *thread*<sub>0</sub>).
+
+
+
+
+
+Calling thread's discontinuity tail_flags
+
+
+
+*thread*<sub>BLOCK_THREADS - 1</sub> only: the item with which to compare the last tile item
+(`input[ITEMS_PER_THREAD - 1]` from *thread*<sub>BLOCK_THREADS - 1</sub>).
+
+
+
+
+
+Calling thread's input items
+
+
+
+Binary boolean flag predicate
+
+
+**Example**
+
+The code snippet below illustrates the head- and tail-flagging of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
+
+Suppose the set of input `thread_data` across the block of threads is `{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }`, that the `tile_predecessor_item` is `0`, and that the `tile_successor_item` is `125`. The corresponding output `head_flags` in those threads will be `{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }`, and the corresponding output `tail_flags` in those threads will be `{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
+ using BlockDiscontinuity = cub::BlockDiscontinuity<int, 128>;
+
+ // Allocate shared memory for BlockDiscontinuity
+ __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Have thread0 obtain the predecessor item for the entire tile
+ int tile_predecessor_item;
+ if (threadIdx.x == 0) tile_predecessor_item = ...
+
+ // Have thread127 obtain the successor item for the entire tile
+ int tile_successor_item;
+ if (threadIdx.x == 127) tile_successor_item = ...
+
+ // Collectively compute head and tail flags for discontinuities in the segment
+ int head_flags[4];
+ int tail_flags[4];
+ BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tile_predecessor_item,
+ tail_flags, tile_successor_item,
+ thread_data, cub::Inequality());
+}
+```
+
+
+
+
+---
+
+## Utility methods
+
+### PrivateStorage inline
+
+Internal storage allocator.
+
+
+```cpp showLineNumbers={false}
+_TempStorage & cub::BlockDiscontinuity::PrivateStorage()
+```
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | The thread block size in threads. |
+| `temp_storage` | `_TempStorage &` | Shared storage reference. |
+| `linear_tid` | `unsigned int` | Linear thread-id. |
+
+---
+
+## Inner classes
+
+### _TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockDiscontinuity::_TempStorage
+```
+
+
+Shared memory storage layout type (last element from each thread's input).
+
+| Name | Type | Description |
+|---|---|---|
+| `first_items` | `T` | |
+| `last_items` | `T` | |
+
+### ApplyOp
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockDiscontinuity::ApplyOp
+```
+
+
+Specialization for when FlagOp has third index param.
+
+### ApplyOp< FlagOp, false >
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockDiscontinuity::ApplyOp< FlagOp, false >
+```
+
+
+Specialization for when FlagOp does not have a third index param.
+
+### Iterate
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockDiscontinuity::Iterate
+```
+
+
+Templated unrolling of item comparison (inductive case).
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockDiscontinuity::TempStorage
+```
+
+
+The operations exposed by `BlockDiscontinuity` require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the `__shared__` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or `union`'d with other storage allocation types to facilitate memory reuse.
+
+**Inherits from:** `Uninitialized< _TempStorage >` (public)
diff --git a/fern/cudapages/cub/cub/cub/BlockExchange.mdx b/fern/cudapages/cub/cub/cub/BlockExchange.mdx
new file mode 100644
index 0000000..f213ba7
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockExchange.mdx
@@ -0,0 +1,972 @@
+---
+title: cub::BlockExchange
+description: ""
+---
+
+The BlockExchange class provides collective methods for rearranging data partitioned across a CUDA thread block.
+
+## Performance considerations
+
+- Proper device-specific padding ensures zero bank conflicts for most types.
+
+## Example
+
+The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+
+Suppose the set of striped input `thread_data` across the block of threads is `{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`. The corresponding output `thread_data` in those threads will be `{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
+
+__global__ void ExampleKernel(int *d_data, ...)
+{
+ // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+ using BlockExchange = cub::BlockExchange<int, 128, 4>;
+
+ // Allocate shared memory for BlockExchange
+ __shared__ typename BlockExchange::TempStorage temp_storage;
+
+ // Load a tile of data striped across threads
+ int thread_data[4];
+ cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
+
+ // Collectively exchange data into a blocked arrangement across threads
+ BlockExchange(temp_storage).StripedToBlocked(thread_data);
+}
+```
+
+
+
+
+
+The data type to be exchanged
+
+
+
+The thread block length in threads along the X dimension
+
+
+
+The number of items partitioned onto each thread.
+
+
+
+**[optional]** When `true`, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false)
+
+
+
+**[optional]** The thread block length in threads along the Y dimension (default: 1)
+
+
+
+**[optional]** The thread block length in threads along the Z dimension (default: 1)
+
+
+
+
+
+---
+
+## Collective constructors
+
+### BlockExchange inline
+
+
+
+
+Collective constructor using a private static allocation of shared memory as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockExchange::BlockExchange()
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+Collective constructor using the specified memory allocation as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockExchange::BlockExchange(
+ TempStorage &temp_storage
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Parameters**
+
+
+Reference to memory allocation having layout type [TempStorage](/library/api/cub::_block_exchange::TempStorage)
+
+
+
+
+
+---
+
+## Structured exchanges
+
+### StripedToBlocked inline
+
+Transposes data items from **striped** arrangement to **blocked** arrangement.
+
+
+```cpp showLineNumbers={false}
+template <typename OutputT>
+void cub::BlockExchange::StripedToBlocked(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Items to exchange, converting between **striped** and **blocked** arrangements.
+
+
+
+Items from exchange, converting between **striped** and **blocked** arrangements.
+
+
+**Example**
+
+The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+
+Suppose the set of striped input `thread_data` across the block of threads is `{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }` after loading from device-accessible memory. The corresponding output `thread_data` in those threads will be `{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
+
+__global__ void ExampleKernel(int *d_data, ...)
+{
+ // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+ using BlockExchange = cub::BlockExchange<int, 128, 4>;
+
+ // Allocate shared memory for BlockExchange
+ __shared__ typename BlockExchange::TempStorage temp_storage;
+
+ // Load a tile of ordered data into a striped arrangement across block threads
+ int thread_data[4];
+ cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
+
+ // Collectively exchange data into a blocked arrangement across threads
+ BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
+}
+```
+
+### BlockedToStriped inline
+
+Transposes data items from **blocked** arrangement to **striped** arrangement.
+
+
+```cpp showLineNumbers={false}
+template <typename OutputT>
+void cub::BlockExchange::BlockedToStriped(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Items to exchange, converting between **striped** and **blocked** arrangements.
+
+
+
+Items from exchange, converting between **striped** and **blocked** arrangements.
+
+
+**Example**
+
+The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+
+Suppose the set of blocked input `thread_data` across the block of threads is `{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }`. The corresponding output `thread_data` in those threads will be `{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }` in preparation for storing to device-accessible memory.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
+
+__global__ void ExampleKernel(int *d_data, ...)
+{
+ // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+ using BlockExchange = cub::BlockExchange<int, 128, 4>;
+
+ // Allocate shared memory for BlockExchange
+ __shared__ typename BlockExchange::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Collectively exchange data into a striped arrangement across threads
+ BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data);
+
+ // Store data striped across block threads into an ordered tile
+ cub::StoreDirectStriped(threadIdx.x, d_data, thread_data);
+}
+```
+
+### WarpStripedToBlocked inline
+
+Transposes data items from **warp-striped** arrangement to **blocked** arrangement.
+
+
+```cpp showLineNumbers={false}
+template <typename OutputT>
+void cub::BlockExchange::WarpStripedToBlocked(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Items to exchange, converting between **striped** and **blocked** arrangements.
+
+
+
+Items from exchange, converting between **striped** and **blocked** arrangements.
+
+
+**Example**
+
+The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+
+Suppose the set of warp-striped input `thread_data` across the block of threads is `{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }` after loading from device-accessible memory. (The first 128 items are striped across the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) The corresponding output `thread_data` in those threads will be `{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
+
+__global__ void ExampleKernel(int *d_data, ...)
+{
+ // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+ using BlockExchange = cub::BlockExchange<int, 128, 4>;
+
+ // Allocate shared memory for BlockExchange
+ __shared__ typename BlockExchange::TempStorage temp_storage;
+
+ // Load a tile of ordered data into a warp-striped arrangement across warp threads
+ int thread_data[4];
+ cub::LoadDirectWarpStriped(threadIdx.x, d_data, thread_data);
+
+ // Collectively exchange data into a blocked arrangement across threads
+ BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
+}
+```
+
+### BlockedToWarpStriped inline
+
+Transposes data items from **blocked** arrangement to **warp-striped** arrangement.
+
+
+```cpp showLineNumbers={false}
+template <typename OutputT>
+void cub::BlockExchange::BlockedToWarpStriped(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Items to exchange, converting between **striped** and **blocked** arrangements.
+
+
+
+Items from exchange, converting between **striped** and **blocked** arrangements.
+
+
+**Example**
+
+The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+
+Suppose the set of blocked input `thread_data` across the block of threads is `{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }`. The corresponding output `thread_data` in those threads will be `{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }` in preparation for storing to device-accessible memory. (The first 128 items are striped across the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
+
+__global__ void ExampleKernel(int *d_data, ...)
+{
+ // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+ using BlockExchange = cub::BlockExchange<int, 128, 4>;
+
+ // Allocate shared memory for BlockExchange
+ __shared__ typename BlockExchange::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Collectively exchange data into a warp-striped arrangement across threads
+ BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data);
+
+ // Store data striped across warp threads into an ordered tile
+ cub::StoreDirectWarpStriped(threadIdx.x, d_data, thread_data);
+}
+```
+
+---
+
+## Scatter exchanges
+
+### ScatterToBlocked inline
+
+Exchanges data items annotated by rank into **blocked** arrangement.
+
+
+```cpp showLineNumbers={false}
+template <typename OutputT, typename OffsetT>
+void cub::BlockExchange::ScatterToBlocked(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread],
+ OffsetT (&ranks)[ItemsPerThread]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Signed integer type for local offsets
+
+
+**Parameters**
+
+
+Items to exchange, converting between **striped** and **blocked** arrangements.
+
+
+
+Items from exchange, converting between **striped** and **blocked** arrangements.
+
+
+
+Corresponding scatter ranks
+
+
+### ScatterToStriped inline
+
+Exchanges data items annotated by rank into **striped** arrangement.
+
+
+```cpp showLineNumbers={false}
+template <typename OutputT, typename OffsetT>
+void cub::BlockExchange::ScatterToStriped(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread],
+ OffsetT (&ranks)[ItemsPerThread]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Signed integer type for local offsets
+
+
+**Parameters**
+
+
+Items to exchange, converting between **striped** and **blocked** arrangements.
+
+
+
+Items from exchange, converting between **striped** and **blocked** arrangements.
+
+
+
+Corresponding scatter ranks
+
+
+### ScatterToStripedGuarded inline
+
+Exchanges data items annotated by rank into **striped** arrangement. Items with rank -1 are not exchanged.
+
+
+```cpp showLineNumbers={false}
+template <typename OutputT, typename OffsetT>
+void cub::BlockExchange::ScatterToStripedGuarded(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread],
+ OffsetT (&ranks)[ItemsPerThread]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Signed integer type for local offsets
+
+
+**Parameters**
+
+
+Items to exchange, converting between **striped** and **blocked** arrangements.
+
+
+
+Items from exchange, converting between **striped** and **blocked** arrangements.
+
+
+
+Corresponding scatter ranks
+
+
+### ScatterToStripedFlagged inline
+
+Exchanges valid data items annotated by rank into **striped** arrangement.
+
+
+```cpp showLineNumbers={false}
+template <typename OutputT, typename OffsetT, typename ValidFlag>
+void cub::BlockExchange::ScatterToStripedFlagged(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread],
+ OffsetT (&ranks)[ItemsPerThread],
+ ValidFlag (&is_valid)[ItemsPerThread]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Signed integer type for local offsets
+
+
+
+**[inferred]** FlagT type denoting which items are valid
+
+
+**Parameters**
+
+
+Items to exchange, converting between **striped** and **blocked** arrangements.
+
+
+
+Items from exchange, converting between **striped** and **blocked** arrangements.
+
+
+
+Corresponding scatter ranks
+
+
+
+Corresponding flag denoting item validity
+
+
+---
+
+## Utility methods
+
+### PrivateStorage inline
+
+Internal storage allocator.
+
+
+```cpp showLineNumbers={false}
+_TempStorage & cub::BlockExchange::PrivateStorage()
+```
+
+
+### BlockedToStriped inline
+
+
+
+
+Transposes data items from **blocked** arrangement to **striped** arrangement.
+
+Specialized for no timeslicing.
+
+
+```cpp showLineNumbers={false}
+template <typename OutputT>
+void cub::BlockExchange::BlockedToStriped(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread],
+ ::cuda::std::false_type
+)
+```
+
+
+**Parameters**
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+
+
+Transposes data items from **blocked** arrangement to **striped** arrangement.
+
+Specialized for warp-timeslicing.
+
+
+```cpp showLineNumbers={false}
+template <typename OutputT>
+void cub::BlockExchange::BlockedToStriped(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread],
+ ::cuda::std::true_type
+)
+```
+
+
+**Parameters**
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+
+
+### BlockedToWarpStriped inline
+
+
+
+
+Transposes data items from **blocked** arrangement to **warp-striped** arrangement.
+
+Specialized for no timeslicing
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockExchange::BlockedToWarpStriped(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread],
+ ::cuda::std::false_type
+)
+```
+
+
+**Parameters**
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+
+
+Transposes data items from **blocked** arrangement to **warp-striped** arrangement.
+
+Specialized for warp-timeslicing
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockExchange::BlockedToWarpStriped(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread],
+ ::cuda::std::true_type
+)
+```
+
+
+**Parameters**
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+
+
+### StripedToBlocked inline
+
+
+
+
+Transposes data items from **striped** arrangement to **blocked** arrangement.
+
+Specialized for no timeslicing.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockExchange::StripedToBlocked(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread],
+ ::cuda::std::false_type
+)
+```
+
+
+**Parameters**
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+
+
+Transposes data items from **striped** arrangement to **blocked** arrangement.
+
+Specialized for warp-timeslicing.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockExchange::StripedToBlocked(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread],
+ ::cuda::std::true_type
+)
+```
+
+
+**Parameters**
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+
+
+### WarpStripedToBlocked inline
+
+
+
+
+Transposes data items from **warp-striped** arrangement to **blocked** arrangement.
+
+Specialized for no timeslicing
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockExchange::WarpStripedToBlocked(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread],
+ ::cuda::std::false_type
+)
+```
+
+
+**Parameters**
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+
+
+Transposes data items from **warp-striped** arrangement to **blocked** arrangement.
+
+Specialized for warp-timeslicing
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockExchange::WarpStripedToBlocked(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread],
+ ::cuda::std::true_type
+)
+```
+
+
+**Parameters**
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+
+
+### ScatterToBlocked inline
+
+
+
+
+Exchanges data items annotated by rank into **blocked** arrangement.
+
+Specialized for no timeslicing.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockExchange::ScatterToBlocked(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread],
+ OffsetT (&ranks)[ItemsPerThread],
+ ::cuda::std::false_type
+)
+```
+
+
+**Parameters**
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+Corresponding scatter ranks
+
+
+
+
+
+Exchanges data items annotated by rank into **blocked** arrangement.
+
+Specialized for warp-timeslicing.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockExchange::ScatterToBlocked(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread],
+ OffsetT ranks[ItemsPerThread],
+ ::cuda::std::true_type
+)
+```
+
+
+**Parameters**
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+Corresponding scatter ranks
+
+
+
+
+
+### ScatterToStriped inline
+
+
+
+
+Exchanges data items annotated by rank into **striped** arrangement.
+
+Specialized for no timeslicing.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockExchange::ScatterToStriped(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread],
+ OffsetT (&ranks)[ItemsPerThread],
+ ::cuda::std::false_type
+)
+```
+
+
+**Parameters**
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+Corresponding scatter ranks
+
+
+
+
+
+Exchanges data items annotated by rank into **striped** arrangement.
+
+Specialized for warp-timeslicing.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockExchange::ScatterToStriped(
+ const T (&input_items)[ItemsPerThread],
+ OutputT (&output_items)[ItemsPerThread],
+ OffsetT (&ranks)[ItemsPerThread],
+ ::cuda::std::true_type
+)
+```
+
+
+**Parameters**
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+Items to exchange, converting between **blocked** and **striped** arrangements.
+
+
+
+Corresponding scatter ranks
+
+
+
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `TempStorage` | `Uninitialized< _TempStorage >` | The operations exposed by `BlockExchange` require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the `__shared__` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or `union`'d with other storage allocation types to facilitate memory reuse. |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | The thread block size in threads. |
+| `WARP_THREADS` static constexpr | `int` | |
+| `WARPS` static constexpr | `int` | |
+| `LOG_SMEM_BANKS` static constexpr | `int` | |
+| `TILE_ITEMS` static constexpr | `int` | |
+| `TIME_SLICES` static constexpr | `int` | |
+| `TIME_SLICED_THREADS` static constexpr | `int` | |
+| `TIME_SLICED_ITEMS` static constexpr | `int` | |
+| `WARP_TIME_SLICED_THREADS` static constexpr | `int` | |
+| `WARP_TIME_SLICED_ITEMS` static constexpr | `int` | |
+| `INSERT_PADDING` static constexpr | `bool` | |
+| `PADDING_ITEMS` static constexpr | `int` | |
+| `temp_storage` | `_TempStorage &` | |
+| `linear_tid` | `unsigned int` | |
+| `lane_id` | `unsigned int` | |
+| `warp_id` | `unsigned int` | |
+| `warp_offset` | `unsigned int` | |
+
+---
+
+## Inner classes
+
+### _TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockExchange::_TempStorage
+```
+
+
+Shared memory storage layout type.
+
+| Name | Type | Description |
+|---|---|---|
+| `buff` | `T` | |
diff --git a/fern/cudapages/cub/cub/cub/BlockHistogram.mdx b/fern/cudapages/cub/cub/cub/BlockHistogram.mdx
new file mode 100644
index 0000000..075d741
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockHistogram.mdx
@@ -0,0 +1,360 @@
+---
+title: cub::BlockHistogram
+description: ""
+---
+
+The BlockHistogram class provides collective methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+
+## Performance considerations
+
+- Performance is sensitive to the degree of data movement across the block.
+- All input values must fall between `[0, Bins)`, or behavior is undefined.
+- The histogram output can be constructed in shared or device-accessible memory
+- See `cub::BlockHistogramAlgorithm` for performance details regarding algorithmic alternatives
+
+## Example
+
+The code snippet below illustrates a 256-bin histogram of 512 integer samples that are partitioned across 128 threads where each thread owns 4 samples.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+ using BlockHistogram = cub::BlockHistogram<unsigned char, 128, 4, 256>;
+
+ // Allocate shared memory for BlockHistogram
+ __shared__ typename BlockHistogram::TempStorage temp_storage;
+
+ // Allocate shared memory for block-wide histogram bin counts
+ __shared__ unsigned int smem_histogram[256];
+
+ // Obtain input samples per thread
+ unsigned char data[4];
+ ...
+
+ // Compute the block-wide histogram
+ BlockHistogram(temp_storage).Histogram(data, smem_histogram);
+}
+```
+
+
+
+
+
+The sample type being histogrammed (must be castable to an integer bin identifier)
+
+
+
+The thread block length in threads along the X dimension
+
+
+
+The number of items per thread
+
+
+
+The number of bins within the histogram
+
+
+
+**[optional]** [cub::BlockHistogramAlgorithm](/library/api/cub::BlockHistogramAlgorithm) enumerator specifying the underlying algorithm to use (default: [cub::BLOCK_HISTO_SORT](/library/api/cub::BLOCK_HISTO_SORT))
+
+
+
+**[optional]** The thread block length in threads along the Y dimension (default: 1)
+
+
+
+**[optional]** The thread block length in threads along the Z dimension (default: 1)
+
+
+
+
+
+---
+
+## Collective constructors
+
+### BlockHistogram inline
+
+
+
+
+Collective constructor using a private static allocation of shared memory as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockHistogram::BlockHistogram()
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+Collective constructor using the specified memory allocation as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockHistogram::BlockHistogram(
+ TempStorage &temp_storage
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Parameters**
+
+
+Reference to memory allocation having layout type [TempStorage](/library/api/cub::BlockHistogram::TempStorage)
+
+
+
+
+
+---
+
+## Histogram operations
+
+### InitHistogram inline
+
+Initialize the shared histogram counters to zero.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockHistogram::InitHistogram(
+ CounterT histogram[Bins]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+**[inferred]** Histogram counter type
+
+
+**Example**
+
+The code snippet below illustrates the initialization and update of a histogram of 512 integer samples that are partitioned across 128 threads where each thread owns 4 samples.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+ using BlockHistogram = cub::BlockHistogram<unsigned char, 128, 4, 256>;
+
+ // Allocate shared memory for BlockHistogram
+ __shared__ typename BlockHistogram::TempStorage temp_storage;
+
+ // Allocate shared memory for block-wide histogram bin counts
+ __shared__ unsigned int smem_histogram[256];
+
+ // Obtain input samples per thread
+ unsigned char thread_samples[4];
+ ...
+
+ // Initialize the block-wide histogram
+ BlockHistogram(temp_storage).InitHistogram(smem_histogram);
+
+ // Update the block-wide histogram
+ BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+}
+```
+
+### Histogram inline
+
+Constructs a block-wide histogram in shared/device-accessible memory. Each thread contributes an array of input elements.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockHistogram::Histogram(
+ T (&items)[ItemsPerThread],
+ CounterT histogram[Bins]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Histogram counter type
+
+
+**Parameters**
+
+
+Calling thread's input values to histogram
+
+
+
+Reference to shared/device-accessible memory histogram
+
+
+**Example**
+
+The code snippet below illustrates a 256-bin histogram of 512 integer samples that are partitioned across 128 threads where each thread owns 4 samples.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+ using BlockHistogram = cub::BlockHistogram<unsigned char, 128, 4, 256>;
+
+ // Allocate shared memory for BlockHistogram
+ __shared__ typename BlockHistogram::TempStorage temp_storage;
+
+ // Allocate shared memory for block-wide histogram bin counts
+ __shared__ unsigned int smem_histogram[256];
+
+ // Obtain input samples per thread
+ unsigned char thread_samples[4];
+ ...
+
+ // Compute the block-wide histogram
+ BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
+}
+```
+
+### Composite inline
+
+Updates an existing block-wide histogram in shared/device-accessible memory. Each thread composites an array of input elements.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockHistogram::Composite(
+ T (&items)[ItemsPerThread],
+ CounterT histogram[Bins]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Histogram counter type
+
+
+**Parameters**
+
+
+Calling thread's input values to histogram
+
+
+
+Reference to shared/device-accessible memory histogram
+
+
+**Example**
+
+The code snippet below illustrates the initialization and update of a histogram of 512 integer samples that are partitioned across 128 threads where each thread owns 4 samples.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+ using BlockHistogram = cub::BlockHistogram<unsigned char, 128, 4, 256>;
+
+ // Allocate shared memory for BlockHistogram
+ __shared__ typename BlockHistogram::TempStorage temp_storage;
+
+ // Allocate shared memory for block-wide histogram bin counts
+ __shared__ unsigned int smem_histogram[256];
+
+ // Obtain input samples per thread
+ unsigned char thread_samples[4];
+ ...
+
+ // Initialize the block-wide histogram
+ BlockHistogram(temp_storage).InitHistogram(smem_histogram);
+
+ // Update the block-wide histogram
+ BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+}
+```
+
+---
+
+## Utility methods
+
+### PrivateStorage inline
+
+Internal storage allocator.
+
+
+```cpp showLineNumbers={false}
+_TempStorage & cub::BlockHistogram::PrivateStorage()
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `InternalBlockHistogram` | `::cuda::std::_If< Algorithm==BLOCK_HISTO_SORT, detail::BlockHistogramSort< T, BlockDimX, ItemsPerThread, Bins, BlockDimY, BlockDimZ >, detail::BlockHistogramAtomic< Bins > >` | Internal specialization. |
+| `_TempStorage` | `typename InternalBlockHistogram::TempStorage` | Shared memory storage layout type for `BlockHistogram`. |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | The thread block size in threads. |
+| `temp_storage` | `_TempStorage &` | Shared storage reference. |
+| `linear_tid` | `unsigned int` | Linear thread-id. |
+
+---
+
+## Inner classes
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockHistogram::TempStorage
+```
+
+
+The operations exposed by `BlockHistogram` require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the `__shared__` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or `union`'d with other storage allocation types to facilitate memory reuse.
+
+**Inherits from:** `Uninitialized< _TempStorage >` (public)
diff --git a/fern/cudapages/cub/cub/cub/BlockLoad.mdx b/fern/cudapages/cub/cub/cub/BlockLoad.mdx
new file mode 100644
index 0000000..17184e9
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockLoad.mdx
@@ -0,0 +1,419 @@
+---
+title: cub::BlockLoad
+description: ""
+---
+
+The BlockLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread block.
+
+## Example
+
+The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for `BLOCK_LOAD_WARP_TRANSPOSE`, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads).
+
+Suppose the input `d_data` is `0, 1, 2, 3, 4, 5, ...`. The set of `thread_data` across the block of threads will be `{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
+
+__global__ void ExampleKernel(int *d_data, ...)
+{
+ // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+ using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
+
+ // Allocate shared memory for BlockLoad
+ __shared__ typename BlockLoad::TempStorage temp_storage;
+
+ // Load a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ BlockLoad(temp_storage).Load(d_data, thread_data);
+}
+```
+
+
+
+
+
+The data type to read into (which must be convertible from the input iterator's value type).
+
+
+
+
+
+
+The number of consecutive items partitioned onto each thread.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Collective constructors
+
+### BlockLoad inline
+
+
+
+
+Collective constructor using a private static allocation of shared memory as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockLoad::BlockLoad()
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+Collective constructor using the specified memory allocation as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockLoad::BlockLoad(
+ TempStorage &temp_storage
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Parameters**
+
+
+Reference to memory allocation having layout type [TempStorage](/library/api/cub::_block_load::TempStorage)
+
+
+
+
+
+---
+
+## Data movement
+
+### Load inline
+
+
+
+
+Load a linear segment of items from memory.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockLoad::Load(
+ RandomAccessIterator block_src_it,
+ T (&dst_items)[ItemsPerThread]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Data is in a blocked arrangement across threads.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+The thread block's base iterator for loading from
+
+
+
+Destination to load data into
+
+
+**Example**
+
+The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for `BLOCK_LOAD_WARP_TRANSPOSE`, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads).
+
+Suppose the input `d_data` is `0, 1, 2, 3, 4, 5, ...`. The set of `thread_data` across the block of threads will be `{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
+
+__global__ void ExampleKernel(int *d_data, ...)
+{
+ // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+ using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
+
+ // Allocate shared memory for BlockLoad
+ __shared__ typename BlockLoad::TempStorage temp_storage;
+
+ // Load a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ BlockLoad(temp_storage).Load(d_data, thread_data);
+}
+```
+
+
+
+
+Load a linear segment of items from memory, guarded by range.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockLoad::Load(
+ RandomAccessIterator block_src_it,
+ T (&dst_items)[ItemsPerThread],
+ int block_items_end
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Data is in a blocked arrangement across threads.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+The thread block's base iterator for loading from
+
+
+
+Destination to load data into
+
+
+
+Number of valid items to load
+
+
+**Example**
+
+The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked" arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for `BLOCK_LOAD_WARP_TRANSPOSE`, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads).
+
+Suppose the input `d_data` is `0, 1, 2, 3, 4, 5, 6, ...` and `block_items_end` is `5`. The set of `thread_data` across the block of threads will be `{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }`, with only the first two threads being unmasked to load portions of valid data (and other items remaining unassigned).
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
+
+__global__ void ExampleKernel(int *d_data, int block_items_end, ...)
+{
+ // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+ using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
+
+ // Allocate shared memory for BlockLoad
+ __shared__ typename BlockLoad::TempStorage temp_storage;
+
+ // Load a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end);
+}
+```
+
+
+
+
+Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockLoad::Load(
+ RandomAccessIterator block_src_it,
+ T (&dst_items)[ItemsPerThread],
+ int block_items_end,
+ DefaultT oob_default
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Data is in a blocked arrangement across threads.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+The thread block's base iterator for loading from
+
+
+
+Destination to load data into
+
+
+
+Number of valid items to load
+
+
+
+Default value to assign out-of-bound items
+
+
+**Example**
+
+The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked" arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for `BLOCK_LOAD_WARP_TRANSPOSE`, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads).
+
+Suppose the input `d_data` is `0, 1, 2, 3, 4, 5, 6, ...`, `block_items_end` is `5`, and the out-of-bounds default is `-1`. The set of `thread_data` across the block of threads will be `{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }`, with only the first two threads being unmasked to load portions of valid data (and other items are assigned `-1`).
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
+
+__global__ void ExampleKernel(int *d_data, int block_items_end, ...)
+{
+ // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+ using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
+
+ // Allocate shared memory for BlockLoad
+ __shared__ typename BlockLoad::TempStorage temp_storage;
+
+ // Load a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end, -1);
+}
+```
+
+
+
+
+---
+
+## Utility methods
+
+### PrivateStorage inline
+
+
+```cpp showLineNumbers={false}
+_TempStorage & cub::BlockLoad::PrivateStorage()
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `InternalLoad` | `LoadInternal< Algorithm, 0 >` | |
+| `_TempStorage` | `typename InternalLoad::TempStorage` | |
+| `TempStorage` | `Uninitialized< _TempStorage >` | The operations exposed by `BlockLoad` require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the `__shared__` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or `union`'d with other storage allocation types to facilitate memory reuse. |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BlockThreads` static constexpr | `int` | |
+| `temp_storage` | `_TempStorage &` | |
+| `linear_tid` | `int` | |
+
+---
+
+## Inner classes
+
+### LoadInternal
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockLoad::LoadInternal
+```
+
+
+### LoadInternal< BLOCK_LOAD_DIRECT, Dummy >
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockLoad::LoadInternal< BLOCK_LOAD_DIRECT, Dummy >
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `linear_tid` | `int` | |
+
+### LoadInternal< BLOCK_LOAD_STRIPED, Dummy >
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockLoad::LoadInternal< BLOCK_LOAD_STRIPED, Dummy >
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `linear_tid` | `int` | |
+
+### LoadInternal< BLOCK_LOAD_VECTORIZE, Dummy >
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockLoad::LoadInternal< BLOCK_LOAD_VECTORIZE, Dummy >
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `linear_tid` | `int` | |
+
+### LoadInternal< BLOCK_LOAD_TRANSPOSE, Dummy >
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockLoad::LoadInternal< BLOCK_LOAD_TRANSPOSE, Dummy >
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `temp_storage` | `_TempStorage &` | |
+| `linear_tid` | `int` | |
+
+### LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, Dummy >
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockLoad::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, Dummy >
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `WARP_THREADS` static constexpr | `int` | |
+| `temp_storage` | `_TempStorage &` | |
+| `linear_tid` | `int` | |
+
+### LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, Dummy >
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockLoad::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, Dummy >
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `WARP_THREADS` static constexpr | `int` | |
+| `temp_storage` | `_TempStorage &` | |
+| `linear_tid` | `int` | |
diff --git a/fern/cudapages/cub/cub/cub/BlockLoadType.mdx b/fern/cudapages/cub/cub/cub/BlockLoadType.mdx
new file mode 100644
index 0000000..ba43cc8
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockLoadType.mdx
@@ -0,0 +1,29 @@
+---
+title: cub::BlockLoadType
+description: ""
+---
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `type` | `cub::BlockLoad< T, Policy::BLOCK_THREADS, Policy::ITEMS_PER_THREAD, Policy::LOAD_ALGORITHM >` |
diff --git a/fern/cudapages/cub/cub/cub/BlockMergeSort.mdx b/fern/cudapages/cub/cub/cub/BlockMergeSort.mdx
new file mode 100644
index 0000000..8ae5e16
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockMergeSort.mdx
@@ -0,0 +1,224 @@
+---
+title: cub::BlockMergeSort
+description: "The [BlockMergeSort](/library/api/cub::_block_merge_sort) class provides methods for sorting items partitioned across a CUDA thread block using a merge sorting method."
+---
+
+The `BlockMergeSort` class provides methods for sorting items partitioned across a CUDA thread block using a merge sorting method.
+
+**Overview**
+
+`BlockMergeSort` arranges items into ascending order using a comparison functor with less-than semantics. Merge sort can handle arbitrary types and comparison functors, but is slower than [BlockRadixSort](/library/api/cub::_block_radix_sort) when sorting arithmetic types into ascending/descending order.
+
+**A Simple Example**
+
+Every thread in the block uses the `BlockMergeSort` class by first specializing the `BlockMergeSort` type, then instantiating an instance with parameters for communication, and finally invoking one or more collective member functions.
+
+The code snippet below illustrates a sort of 512 integer keys that are partitioned across 128 threads where each thread owns 4 consecutive items.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_merge_sort.cuh>
+
+struct CustomLess
+{
+ template <typename DataType>
+ __device__ bool operator()(const DataType &lhs, const DataType &rhs)
+ {
+ return lhs < rhs;
+ }
+};
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockMergeSort for a 1D block of 128 threads owning 4 integer items each
+ using BlockMergeSort = cub::BlockMergeSort<int, 128, 4>;
+
+ // Allocate shared memory for BlockMergeSort
+ __shared__ typename BlockMergeSort::TempStorage temp_storage_shuffle;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_keys[4];
+ ...
+
+ BlockMergeSort(temp_storage_shuffle).Sort(thread_keys, CustomLess());
+ ...
+}
+```
+
+Suppose the set of input `thread_keys` across the block of threads is `{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }`. The corresponding output `thread_keys` in those threads will be `{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }`.
+
+**Re-using dynamically allocated shared memory**
+
+The `block/example_block_reduce_dyn_smem.cu` example illustrates usage of dynamically allocated shared memory with [BlockReduce](/library/api/cub::_block_reduce) and how to re-purpose the same memory region.
+
+This example can be easily adapted to the storage required by `BlockMergeSort`.
+
+
+
+
+
+KeyT type
+
+
+
+
+
+
+The number of items per thread
+
+
+
+**[optional]** ValueT type (default: `cub::NullType`, which indicates a keys-only sort)
+
+
+
+
+
+
+
+
+
+
+
+**Inherits from:** `cub::BlockMergeSortStrategy< KeyT, NullType, BlockDimX *1 *1, ItemsPerThread, BlockMergeSort< KeyT, BlockDimX, ItemsPerThread, NullType, 1, 1 > >` (public)
+
+---
+
+## Constructors
+
+### BlockMergeSort inline
+
+
+
+
+
+```cpp showLineNumbers={false}
+cub::BlockMergeSort::BlockMergeSort()
+```
+
+
+
+
+
+explicit
+
+
+```cpp showLineNumbers={false}
+cub::BlockMergeSort::BlockMergeSort(
+ typename BlockMergeSortStrategyT::TempStorage &temp_storage
+)
+```
+
+
+
+
+
+---
+
+## Methods
+
+### get_linear_tid inline const
+
+
+```cpp showLineNumbers={false}
+unsigned int cub::BlockMergeSortStrategy>::get_linear_tid() const
+```
+
+
+### Sort inline
+
+Sorts items partitioned across a CUDA thread block using a merge sorting method.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockMergeSortStrategy>::Sort(
+ KeyT (&keys)[ItemsPerThread],
+ CompareOp compare_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Comparison function object which returns true if the first argument is ordered before the second
+
+
+### StableSort inline
+
+Sorts items partitioned across a CUDA thread block using a merge sorting method.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockMergeSortStrategy>::StableSort(
+ KeyT (&keys)[ItemsPerThread],
+ CompareOp compare_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Comparison function object which returns true if the first argument is ordered before the second
+
+
+### SyncImplementation inline const
+
+
+```cpp showLineNumbers={false}
+void cub::BlockMergeSort::SyncImplementation() const
+```
+
+
+### PrivateStorage inline
+
+Internal storage allocator.
+
+
+```cpp showLineNumbers={false}
+_TempStorage & cub::BlockMergeSortStrategy>::PrivateStorage()
+```
+
+
+### Sync inline const
+
+
+```cpp showLineNumbers={false}
+void cub::BlockMergeSortStrategy>::Sync() const
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `BlockMergeSortStrategyT` | `BlockMergeSortStrategy< KeyT, ValueT, BLOCK_THREADS, ItemsPerThread, BlockMergeSort >` |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | |
+| `ITEMS_PER_TILE` static constexpr | `int` | |
+| `KEYS_ONLY` static constexpr | `bool` | |
+| `BlockMergeSortStrategyT` | `friend` | |
+| `temp_storage` | `_TempStorage &` | Shared storage reference. |
+| `linear_tid` | `const unsigned int` | |
diff --git a/fern/cudapages/cub/cub/cub/BlockMergeSortStrategy.mdx b/fern/cudapages/cub/cub/cub/BlockMergeSortStrategy.mdx
new file mode 100644
index 0000000..0e59e02
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockMergeSortStrategy.mdx
@@ -0,0 +1,507 @@
+---
+title: cub::BlockMergeSortStrategy
+description: "Generalized merge sort algorithm."
+---
+
+Generalized merge sort algorithm.
+
+This class is used to reduce code duplication. Warp and Block merge sort differ only in how they compute thread index and how they synchronize threads. Since synchronization might require access to custom data (like member mask), CRTP is used.
+
+The code snippet below illustrates the way this class can be used.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_merge_sort.cuh>
+
+constexpr int BLOCK_THREADS = 256;
+constexpr int ItemsPerThread = 9;
+
+class BlockMergeSort : public BlockMergeSortStrategy<int, cub::NullType, BLOCK_THREADS, ItemsPerThread, BlockMergeSort>
+{
+ using BlockMergeSortStrategyT =
+ BlockMergeSortStrategy<int, cub::NullType, BLOCK_THREADS, ItemsPerThread, BlockMergeSort>;
+public:
+ __device__ __forceinline__ explicit BlockMergeSort(
+ typename BlockMergeSortStrategyT::TempStorage &temp_storage)
+ : BlockMergeSortStrategyT(temp_storage, threadIdx.x)
+ {}
+
+ __device__ __forceinline__ void SyncImplementation() const
+ {
+ __syncthreads();
+ }
+};
+```
+
+
+
+
+
+KeyT type
+
+
+
+ValueT type. cub::NullType indicates a keys-only sort
+
+
+
+
+
+
+
+
+
+Provides a way of synchronizing threads. Should be derived from `BlockMergeSortStrategy`.
+
+
+
+
+
+---
+
+## Constructors
+
+### BlockMergeSortStrategy inline
+
+
+
+
+explicit
+
+
+```cpp showLineNumbers={false}
+cub::BlockMergeSortStrategy::BlockMergeSortStrategy(
+ unsigned int linear_tid
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+cub::BlockMergeSortStrategy::BlockMergeSortStrategy(
+ TempStorage &temp_storage,
+ unsigned int linear_tid
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+cub::BlockMergeSortStrategy::BlockMergeSortStrategy() = delete
+```
+
+
+
+
+
+---
+
+## Methods
+
+### PrivateStorage inline
+
+Internal storage allocator.
+
+
+```cpp showLineNumbers={false}
+_TempStorage & cub::BlockMergeSortStrategy::PrivateStorage()
+```
+
+
+### Sync inline const
+
+
+```cpp showLineNumbers={false}
+void cub::BlockMergeSortStrategy::Sync() const
+```
+
+
+### get_linear_tid inline const
+
+
+```cpp showLineNumbers={false}
+unsigned int cub::BlockMergeSortStrategy::get_linear_tid() const
+```
+
+
+### Sort inline
+
+
+
+
+Sorts items partitioned across a CUDA thread block using a merge sorting method.
+
+
+```cpp showLineNumbers={false}
+template <typename CompareOp>
+void cub::BlockMergeSortStrategy::Sort(
+ KeyT (&keys)[ItemsPerThread],
+ CompareOp compare_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. `CompareOp` is a model of [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order).
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Comparison function object which returns true if the first argument is ordered before the second
+
+
+
+
+
+Sorts items partitioned across a CUDA thread block using a merge sorting method.
+
+
+```cpp showLineNumbers={false}
+template <typename CompareOp>
+void cub::BlockMergeSortStrategy::Sort(
+ KeyT (&keys)[ItemsPerThread],
+ CompareOp compare_op,
+ int valid_items,
+ KeyT oob_default
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. `CompareOp` is a model of [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order).
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Comparison function object which returns true if the first argument is ordered before the second
+
+
+
+Number of valid items to sort
+
+
+
+Default value to assign out-of-bound items
+
+
+
+
+
+Sorts items partitioned across a CUDA thread block using a merge sorting method.
+
+
+```cpp showLineNumbers={false}
+template <typename CompareOp>
+void cub::BlockMergeSortStrategy::Sort(
+ KeyT (&keys)[ItemsPerThread],
+ ValueT (&items)[ItemsPerThread],
+ CompareOp compare_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. `CompareOp` is a model of [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order).
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Values to sort
+
+
+
+Comparison function object which returns true if the first argument is ordered before the second
+
+
+
+
+
+Sorts items partitioned across a CUDA thread block using a merge sorting method.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockMergeSortStrategy::Sort(
+ KeyT (&keys)[ItemsPerThread],
+ ValueT (&items)[ItemsPerThread],
+ CompareOp compare_op,
+ int valid_items,
+ KeyT oob_default
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. `CompareOp` is a model of [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order).
+
+
+
+True if `valid_items` isn't equal to the [`ITEMS_PER_TILE`](/library/api/cub::_block_merge_sort_strategy::ITEMS_PER_TILE)
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Values to sort
+
+
+
+Comparison function object which returns true if the first argument is ordered before the second
+
+
+
+Number of valid items to sort
+
+
+
+Default value to assign out-of-bound items
+
+
+
+
+
+### StableSort inline
+
+
+
+
+Sorts items partitioned across a CUDA thread block using a merge sorting method.
+
+
+```cpp showLineNumbers={false}
+template <typename CompareOp>
+void cub::BlockMergeSortStrategy::StableSort(
+ KeyT (&keys)[ItemsPerThread],
+ CompareOp compare_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. `CompareOp` is a model of [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order).
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Comparison function object which returns true if the first argument is ordered before the second
+
+
+
+
+
+Sorts items partitioned across a CUDA thread block using a merge sorting method.
+
+
+```cpp showLineNumbers={false}
+template <typename CompareOp>
+void cub::BlockMergeSortStrategy::StableSort(
+ KeyT (&keys)[ItemsPerThread],
+ ValueT (&items)[ItemsPerThread],
+ CompareOp compare_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. `CompareOp` is a model of [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order).
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Values to sort
+
+
+
+Comparison function object which returns true if the first argument is ordered before the second
+
+
+
+
+
+Sorts items partitioned across a CUDA thread block using a merge sorting method.
+
+
+```cpp showLineNumbers={false}
+template <typename CompareOp>
+void cub::BlockMergeSortStrategy::StableSort(
+ KeyT (&keys)[ItemsPerThread],
+ CompareOp compare_op,
+ int valid_items,
+ KeyT oob_default
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. `CompareOp` is a model of [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order).
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Comparison function object which returns true if the first argument is ordered before the second
+
+
+
+Number of valid items to sort
+
+
+
+Default value to assign out-of-bound items
+
+
+
+
+
+Sorts items partitioned across a CUDA thread block using a merge sorting method.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockMergeSortStrategy::StableSort(
+ KeyT (&keys)[ItemsPerThread],
+ ValueT (&items)[ItemsPerThread],
+ CompareOp compare_op,
+ int valid_items,
+ KeyT oob_default
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. `CompareOp` is a model of [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order).
+
+
+
+True if `valid_items` isn't equal to the [`ITEMS_PER_TILE`](/library/api/cub::_block_merge_sort_strategy::ITEMS_PER_TILE)
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Values to sort
+
+
+
+Comparison function object which returns true if the first argument is ordered before the second
+
+
+
+Number of valid items to sort
+
+
+
+Default value to assign out-of-bound items
+
+
+
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `ITEMS_PER_TILE` static constexpr | `int` | |
+| `KEYS_ONLY` static constexpr | `bool` | |
+| `temp_storage` | `_TempStorage &` | Shared storage reference. |
+| `linear_tid` | `const unsigned int` | |
+
+---
+
+## Inner classes
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockMergeSortStrategy::TempStorage
+```
+
+
+The operations exposed by [BlockMergeSort](/library/api/cub::_block_merge_sort) require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the `__shared__` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or `union`'d with other storage allocation types to facilitate memory reuse.
+
+**Inherits from:** `Uninitialized< _TempStorage >` (public)
diff --git a/fern/cudapages/cub/cub/cub/BlockRadixRank.mdx b/fern/cudapages/cub/cub/cub/BlockRadixRank.mdx
new file mode 100644
index 0000000..5133d5a
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockRadixRank.mdx
@@ -0,0 +1,284 @@
+---
+title: cub::BlockRadixRank
+description: ""
+---
+
+BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
+
+## Performance considerations
+
+- Performance is sensitive to the degree of data movement across the block.
+
+Suppose the set of input `keys` across the block of threads is `{ [16,10], [9,11] }`. The extractor will rank only the lowest 5 bits: `{ [16,10], [9,11] }` (bits 0-4). The corresponding output `ranks` in those threads will be `{ [3,1], [0,2] }`.
+
+
+
+
+
+The thread block length in threads along the X dimension
+
+
+
+The number of radix bits per digit place
+
+
+
+Whether or not the sorted-order is high-to-low
+
+
+
+**[optional]** Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). See [`BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE`](/library/api/cub::BLOCK_SCAN_RAKING_MEMOIZE) for more details.
+
+
+
+**[optional]** The [cub::BlockScanAlgorithm](/library/api/cub::BlockScanAlgorithm) algorithm to use (default: [cub::BLOCK_SCAN_WARP_SCANS](/library/api/cub::BLOCK_SCAN_WARP_SCANS))
+
+
+
+**[optional]** Shared memory bank mode (default: `cudaSharedMemBankSizeFourByte`)
+
+
+
+**[optional]** The thread block length in threads along the Y dimension (default: 1)
+
+
+
+**[optional]** The thread block length in threads along the Z dimension (default: 1)
+
+
+
+
+
+---
+
+## Collective constructors
+
+### BlockRadixRank inline
+
+
+
+
+Collective constructor using a private static allocation of shared memory as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockRadixRank::BlockRadixRank()
+```
+
+
+
+
+
+Collective constructor using the specified memory allocation as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockRadixRank::BlockRadixRank(
+ TempStorage &temp_storage
+)
+```
+
+
+**Parameters**
+
+
+Reference to memory allocation having layout type [TempStorage](/library/api/cub::BlockRadixRank::TempStorage)
+
+
+
+
+
+---
+
+## Raking
+
+### RankKeys inline
+
+
+
+
+Rank keys.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockRadixRank::RankKeys(
+ UnsignedBits (&keys)[KEYS_PER_THREAD],
+ int (&ranks)[KEYS_PER_THREAD],
+ DigitExtractorT digit_extractor
+)
+```
+
+
+**Parameters**
+
+
+Keys for this tile
+
+
+
+For each key, the local rank within the tile
+
+
+
+The digit extractor
+
+
+
+
+
+Rank keys.
+
+For the lower `RADIX_DIGITS` threads, digit counts for each digit are provided for the corresponding thread.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockRadixRank::RankKeys(
+ UnsignedBits (&keys)[KEYS_PER_THREAD],
+ int (&ranks)[KEYS_PER_THREAD],
+ DigitExtractorT digit_extractor,
+ int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]
+)
+```
+
+
+**Parameters**
+
+
+Keys for this tile
+
+
+
+For each key, the local rank within the tile (out parameter)
+
+
+
+The digit extractor
+
+
+
+The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
+
+
+
+
+
+---
+
+## Utility methods
+
+### PrivateStorage inline
+
+Internal storage allocator.
+
+
+```cpp showLineNumbers={false}
+_TempStorage & cub::BlockRadixRank::PrivateStorage()
+```
+
+
+### Upsweep inline
+
+Performs upsweep raking reduction, returning the aggregate.
+
+
+```cpp showLineNumbers={false}
+PackedCounter cub::BlockRadixRank::Upsweep()
+```
+
+
+### ExclusiveDownsweep inline
+
+Performs exclusive downsweep raking scan.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockRadixRank::ExclusiveDownsweep(
+ PackedCounter raking_partial
+)
+```
+
+
+### ResetCounters inline
+
+Reset shared memory digit counters.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockRadixRank::ResetCounters()
+```
+
+
+### ScanCounters inline
+
+Scan shared memory digit counters.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockRadixRank::ScanCounters()
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `DigitCounter` | `unsigned short` | |
+| `PackedCounter` | `::cuda::std::_If< SMemConfig==cudaSharedMemBankSizeEightByte, unsigned long long, unsigned int >` | |
+| `BlockScan` | `BlockScan< PackedCounter, BlockDimX, InnerScanAlgorithm, BlockDimY, BlockDimZ >` | [BlockScan](/library/api/cub::BlockRadixRank::BlockScan) type. |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `max_tile_size` static constexpr | `DigitCounter` | |
+| `BLOCK_THREADS` static constexpr | `int` | |
+| `RADIX_DIGITS` static constexpr | `int` | |
+| `LOG_WARP_THREADS` static constexpr | `int` | |
+| `WARP_THREADS` static constexpr | `int` | |
+| `WARPS` static constexpr | `int` | |
+| `BYTES_PER_COUNTER` static constexpr | `int` | |
+| `LOG_BYTES_PER_COUNTER` static constexpr | `int` | |
+| `PACKING_RATIO` static constexpr | `int` | |
+| `LOG_PACKING_RATIO` static constexpr | `int` | |
+| `LOG_COUNTER_LANES` static constexpr | `int` | |
+| `COUNTER_LANES` static constexpr | `int` | |
+| `PADDED_COUNTER_LANES` static constexpr | `int` | |
+| `RAKING_SEGMENT` static constexpr | `int` | |
+| `BINS_TRACKED_PER_THREAD` static constexpr | `int` | Number of bin-starting offsets tracked per thread. |
+| `temp_storage` | `_TempStorage &` | Shared storage reference. |
+| `linear_tid` | `unsigned int` | Linear thread-id. |
+| `cached_segment` | `PackedCounter` | Copy of raking segment, promoted to registers. |
+
+---
+
+## Inner classes
+
+### PrefixCallBack
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockRadixRank::PrefixCallBack
+```
+
+
+Block-scan prefix callback.
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockRadixRank::TempStorage
+```
+
+
+The operations exposed by `BlockRadixRank` require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the `__shared__` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or `union`'d with other storage allocation types to facilitate memory reuse.
+
+**Inherits from:** `Uninitialized< _TempStorage >` (public)
diff --git a/fern/cudapages/cub/cub/cub/BlockRadixRankEmptyCallback.mdx b/fern/cudapages/cub/cub/cub/BlockRadixRankEmptyCallback.mdx
new file mode 100644
index 0000000..4450e4c
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockRadixRankEmptyCallback.mdx
@@ -0,0 +1,29 @@
+---
+title: cub::BlockRadixRankEmptyCallback
+description: "Empty callback implementation."
+---
+
+Empty callback implementation.
+
+
+
+
+
+
+
+
+
+
+---
+
+## Methods
+
+### operator() inline
+
+
+```cpp showLineNumbers={false}
+void cub::BlockRadixRankEmptyCallback::operator()(
+ int (&bins)[BINS_PER_THREAD]
+)
+```
+
diff --git a/fern/cudapages/cub/cub/cub/BlockRadixRankMatch.mdx b/fern/cudapages/cub/cub/cub/BlockRadixRankMatch.mdx
new file mode 100644
index 0000000..31e19bc
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockRadixRankMatch.mdx
@@ -0,0 +1,246 @@
+---
+title: cub::BlockRadixRankMatch
+description: "Radix-rank using match.any."
+---
+
+Radix-rank using match.any.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Collective constructors
+
+### BlockRadixRankMatch inline
+
+Collective constructor using the specified memory allocation as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockRadixRankMatch::BlockRadixRankMatch(
+ TempStorage &temp_storage
+)
+```
+
+
+**Parameters**
+
+
+Reference to memory allocation having layout type [TempStorage](/library/api/cub::BlockRadixRankMatch::TempStorage)
+
+
+---
+
+## Raking
+
+### CallBack inline
+
+Computes the count of keys for each digit value, and calls the callback with the array of key counts.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockRadixRankMatch::CallBack(
+ CountsCallback callback
+)
+```
+
+
+**Template parameters**
+
+
+The callback type. It should implement an instance overload of `operator()(int (&bins)[BINS_TRACKED_PER_THREAD])`, where `bins` is an array of key counts for each digit value distributed in block distribution among the threads of the thread block. Key counts can be used to update other data structures in global or shared memory. Depending on the implementation of the ranking algorithm (see [BlockRadixRankMatchEarlyCounts](/library/api/cub::_block_radix_rank_match_early_counts)), key counts may become available early; therefore, they are returned through a callback rather than a separate output parameter of [RankKeys()](/library/api/cub::_block_radix_rank_match::RankKeys()).
+
+
+### RankKeys inline
+
+
+
+
+Rank keys.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockRadixRankMatch::RankKeys(
+ UnsignedBits (&keys)[KEYS_PER_THREAD],
+ int (&ranks)[KEYS_PER_THREAD],
+ DigitExtractorT digit_extractor,
+ CountsCallback callback
+)
+```
+
+
+**Parameters**
+
+
+Keys for this tile
+
+
+
+For each key, the local rank within the tile
+
+
+
+The digit extractor
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockRadixRankMatch::RankKeys(
+ UnsignedBits (&keys)[KEYS_PER_THREAD],
+ int (&ranks)[KEYS_PER_THREAD],
+ DigitExtractorT digit_extractor
+)
+```
+
+
+
+
+
+Rank keys.
+
+For the lower `RADIX_DIGITS` threads, digit counts for each digit are provided for the corresponding thread.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockRadixRankMatch::RankKeys(
+ UnsignedBits (&keys)[KEYS_PER_THREAD],
+ int (&ranks)[KEYS_PER_THREAD],
+ DigitExtractorT digit_extractor,
+ int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD],
+ CountsCallback callback
+)
+```
+
+
+**Parameters**
+
+
+Keys for this tile
+
+
+
+For each key, the local rank within the tile (out parameter)
+
+
+
+The digit extractor
+
+
+
+The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockRadixRankMatch::RankKeys(
+ UnsignedBits (&keys)[KEYS_PER_THREAD],
+ int (&ranks)[KEYS_PER_THREAD],
+ DigitExtractorT digit_extractor,
+ int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]
+)
+```
+
+
+**Parameters**
+
+
+Keys for this tile
+
+
+
+For each key, the local rank within the tile (out parameter)
+
+
+
+The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
+
+
+
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `RankT` | `int32_t` | |
+| `DigitCounterT` | `int32_t` | |
+| `BlockScanT` | `BlockScan< DigitCounterT, BLOCK_THREADS, InnerScanAlgorithm, BlockDimY, BlockDimZ >` | [BlockScan](/library/api/cub::_block_scan) type. |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | |
+| `RADIX_DIGITS` static constexpr | `int` | |
+| `LOG_WARP_THREADS` static constexpr | `int` | |
+| `WARP_THREADS` static constexpr | `int` | |
+| `PARTIAL_WARP_THREADS` static constexpr | `int` | |
+| `WARPS` static constexpr | `int` | |
+| `PADDED_WARPS` static constexpr | `int` | |
+| `COUNTERS` static constexpr | `int` | |
+| `RAKING_SEGMENT` static constexpr | `int` | |
+| `PADDED_RAKING_SEGMENT` static constexpr | `int` | |
+| `BINS_TRACKED_PER_THREAD` static constexpr | `int` | Number of bin-starting offsets tracked per thread. |
+| `temp_storage` | `_TempStorage &` | Shared storage reference. |
+| `linear_tid` | `unsigned int` | Linear thread-id. |
+
+---
+
+## Inner classes
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockRadixRankMatch::TempStorage
+```
+
+
+The operations exposed by `BlockRadixRankMatch` require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the `__shared__` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or `union`'d with other storage allocation types to facilitate memory reuse.
+
+**Inherits from:** `Uninitialized< _TempStorage >` (public)
diff --git a/fern/cudapages/cub/cub/cub/BlockRadixRankMatchEarlyCounts.mdx b/fern/cudapages/cub/cub/cub/BlockRadixRankMatchEarlyCounts.mdx
new file mode 100644
index 0000000..de40352
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockRadixRankMatchEarlyCounts.mdx
@@ -0,0 +1,173 @@
+---
+title: cub::BlockRadixRankMatchEarlyCounts
+description: "Radix-rank using matching which computes the counts of keys for each digit value early, at the expense of doing more work."
+---
+
+Radix-rank using matching which computes the counts of keys for each digit value early, at the expense of doing more work.
+
+This may be useful e.g. for decoupled look-back, where it reduces the time other thread blocks need to wait for digit counts to become available.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### BlockRadixRankMatchEarlyCounts inline
+
+
+```cpp showLineNumbers={false}
+cub::BlockRadixRankMatchEarlyCounts::BlockRadixRankMatchEarlyCounts(
+ TempStorage &temp_storage
+)
+```
+
+
+---
+
+## Methods
+
+### RankKeys inline
+
+
+
+
+Rank keys.
+
+For the lower `RADIX_DIGITS` threads, digit counts for each digit are provided for the corresponding thread.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockRadixRankMatchEarlyCounts::RankKeys(
+ UnsignedBits (&keys)[KEYS_PER_THREAD],
+ int (&ranks)[KEYS_PER_THREAD],
+ DigitExtractorT digit_extractor,
+ int (&exclusive_digit_prefix)[BINS_PER_THREAD],
+ CountsCallback callback
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockRadixRankMatchEarlyCounts::RankKeys(
+ UnsignedBits (&keys)[KEYS_PER_THREAD],
+ int (&ranks)[KEYS_PER_THREAD],
+ DigitExtractorT digit_extractor,
+ int (&exclusive_digit_prefix)[BINS_PER_THREAD]
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockRadixRankMatchEarlyCounts::RankKeys(
+ UnsignedBits (&keys)[KEYS_PER_THREAD],
+ int (&ranks)[KEYS_PER_THREAD],
+ DigitExtractorT digit_extractor
+)
+```
+
+
+
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `BlockScan` | `cub::BlockScan< int, BLOCK_THREADS, InnerScanAlgorithm >` |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | |
+| `RADIX_DIGITS` static constexpr | `int` | |
+| `BINS_PER_THREAD` static constexpr | `int` | |
+| `BINS_TRACKED_PER_THREAD` static constexpr | `int` | |
+| `FULL_BINS` static constexpr | `int` | |
+| `WARP_THREADS` static constexpr | `int` | |
+| `PARTIAL_WARP_THREADS` static constexpr | `int` | |
+| `BLOCK_WARPS` static constexpr | `int` | |
+| `PARTIAL_WARP_ID` static constexpr | `int` | |
+| `WARP_MASK` static constexpr | `int` | |
+| `NUM_MATCH_MASKS` static constexpr | `int` | |
+| `MATCH_MASKS_ALLOC_SIZE` static constexpr | `int` | |
+| `temp_storage` | `TempStorage &` | |
+
+---
+
+## Inner classes
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockRadixRankMatchEarlyCounts::TempStorage
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `warp_offsets` | `int` | |
+| `warp_histograms` | `int` | |
+| `` | `union cub::BlockRadixRankMatchEarlyCounts::TempStorage` | |
+| `match_masks` | `::cuda::std::uint32_t` | |
+| `prefix_tmp` | `BlockScan::TempStorage` | |
+
+### BlockRadixRankMatchInternal
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockRadixRankMatchEarlyCounts::BlockRadixRankMatchInternal
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `s` | `TempStorage &` | |
+| `digit_extractor` | `DigitExtractorT` | |
+| `callback` | `CountsCallback` | |
+| `warp` | `int` | |
+| `lane` | `int` | |
diff --git a/fern/cudapages/cub/cub/cub/BlockRadixSort.mdx b/fern/cudapages/cub/cub/cub/BlockRadixSort.mdx
new file mode 100644
index 0000000..53da08e
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockRadixSort.mdx
@@ -0,0 +1,1753 @@
+---
+title: cub::BlockRadixSort
+description: ""
+---
+
+BlockRadixSort class provides collective methods for sorting items partitioned across a CUDA thread block using a radix sorting method.
+
+
+
+The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges items into ascending order. It relies upon a positional representation for keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, characters, etc.) specified from least-significant to most-significant. For a given input sequence of keys and a set of rules specifying a total ordering of the symbolic alphabet, the radix sorting method produces a lexicographic ordering of those keys.
+
+Assumes threads are in row-major order.
+
+
+BlockRadixSort can sort all of the built-in C++ numeric primitive types (`unsigned char`, `int`, `double`, etc.) as well as CUDA's `__half` half-precision floating-point type. User-defined types are supported as long as a decomposer object is provided.
+
+
+- Positive and negative zeros are considered equivalent, and will be treated
+ as such in the output.
+- No special handling is implemented for NaN values; these are sorted
+ according to their bit representations after any transformations.
+
+
+Although the direct radix sorting method can only be applied to unsigned integral types, BlockRadixSort is able to sort signed and floating-point types via simple bit-wise transformations that ensure lexicographic key ordering.
+
+These transformations must be considered when restricting the `[begin_bit, end_bit)` range, as the bitwise transformations will occur before the bit-range truncation.
+
+Any transformations applied to the keys prior to sorting are reversed while writing to the final output buffer.
+
+
+To convert the input values into a radix-sortable bitwise representation, the following transformations take place prior to sorting:
+
+* For unsigned integral values, the keys are used directly.
+* For signed integral values, the sign bit is inverted.
+* For positive floating point values, the sign bit is inverted.
+* For negative floating point values, the full key is inverted.
+
+
+Unlike `DeviceRadixSort`, `BlockRadixSort` does not invert the input key bits when performing a descending sort. Instead, it has special logic to reverse the order of the keys while sorting.
+
+
+BlockRadixSort is stable. For floating-point types -0.0 and +0.0 are considered equal and appear in the result in the same order as they appear in the input.
+
+
+
+* Performance is sensitive to the degree of data movement across the block.
+
+
+
+
+The code snippet below illustrates a sort of 512 integer keys that are partitioned in a [blocked arrangement](../index.html#sec5sec3) across 128 threads where each thread owns 4 consecutive items.
+
+.. tab-set-code::
+
+Suppose the set of input `thread_keys` across the block of threads is `{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }`. The corresponding output `thread_keys` in those threads will be `{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }`.
+
+
+The `block/example_block_reduce_dyn_smem.cu` example illustrates usage of dynamically shared memory with BlockReduce and how to re-purpose the same memory region.
+
+This example can be easily adapted to the storage required by BlockRadixSort.
+
+
+
+
+
+KeyT type
+
+
+
+The thread block length in threads along the X dimension
+
+
+
+The number of items per thread
+
+
+
+**[optional]** ValueT type (default: cub::NullType, which indicates a keys-only sort)
+
+
+
+**[optional]** The number of radix bits per digit place (default: 4 bits)
+
+
+
+**[optional]** Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).
+
+
+
+**[optional]** The [cub::BlockScanAlgorithm](/library/api/cub::BlockScanAlgorithm) algorithm to use (default: [cub::BLOCK_SCAN_WARP_SCANS](/library/api/cub::BLOCK_SCAN_WARP_SCANS))
+
+
+
+**[optional]** Shared memory bank mode (default: `cudaSharedMemBankSizeFourByte`)
+
+
+
+**[optional]** The thread block length in threads along the Y dimension (default: 1)
+
+
+
+**[optional]** The thread block length in threads along the Z dimension (default: 1)
+
+
+
+
+
+---
+
+## Collective constructors
+
+### BlockRadixSort inline
+
+
+
+
+Collective constructor using a private static allocation of shared memory as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockRadixSort::BlockRadixSort()
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+Collective constructor using the specified memory allocation as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockRadixSort::BlockRadixSort(
+ TempStorage &temp_storage
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Parameters**
+
+
+Reference to memory allocation having layout type [TempStorage](/library/api/cub::BlockRadixSort::TempStorage)
+
+
+
+
+
+---
+
+## Sorting (blocked arrangements)
+
+### Sort inline
+
+
+
+
+Performs an ascending block-wide radix sort over a blocked arrangement of keys.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockRadixSort::Sort(
+ KeyT (&keys)[ItemsPerThread],
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+**[optional]** The beginning (least-significant) bit index needed for key comparison
+
+
+
+**[optional]** The past-the-end (most-significant) bit index needed for key comparison
+
+
+**Example**
+
+The code snippet below illustrates a sort of 512 integer keys that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive keys.
+
+Suppose the set of input `thread_keys` across the block of threads is `{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }`. The corresponding output `thread_keys` in those threads will be `{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
+ using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
+
+ // Allocate shared memory for BlockRadixSort
+ __shared__ typename BlockRadixSort::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_keys[4];
+ ...
+
+ // Collectively sort the keys
+ BlockRadixSort(temp_storage).Sort(thread_keys);
+```
+
+
+
+
+Performs an ascending block-wide radix sort over a blocked arrangement of keys.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The code snippet below illustrates a sort of 2 keys that are partitioned in a blocked arrangement across 2 threads where each thread owns 1 key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin keys-bits :end-before: example-end keys-bits
+
+
+```cpp showLineNumbers={false}
+template
+::cuda::std::enable_if_t> cub::BlockRadixSort::Sort(
+ KeyT (&keys)[ItemsPerThread],
+ DecomposerT decomposer,
+ int begin_bit,
+ int end_bit
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* Performance is sensitive to the degree of data movement across the block.
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+**Template parameters**
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+The least-significant bit index (inclusive) needed for key comparison
+
+
+
+The most-significant bit index (exclusive) needed for key comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
+
+
+
+
+
+Performs an ascending block-wide radix sort over a blocked arrangement of keys.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The code snippet below illustrates a sort of 6 keys that are partitioned in a blocked arrangement across 2 threads where each thread owns 3 consecutive keys.
+
+.. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin keys :end-before: example-end keys
+
+
+```cpp showLineNumbers={false}
+template
+::cuda::std::enable_if_t> cub::BlockRadixSort::Sort(
+ KeyT (&keys)[ItemsPerThread],
+ DecomposerT decomposer
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* Performance is sensitive to the degree of data movement across the block.
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+**Template parameters**
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+
+
+Performs an ascending block-wide radix sort across a blocked arrangement of keys and values.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockRadixSort::Sort(
+ KeyT (&keys)[ItemsPerThread],
+ ValueT (&values)[ItemsPerThread],
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+BlockRadixSort can only accommodate one associated tile of values. To "truck along" more than one tile of values, simply perform a key-value sort of the keys paired with a temporary value array that enumerates the key indices. The reordered indices can then be used as a gather-vector for exchanging other associated tile data through shared memory.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Values to sort
+
+
+
+**[optional]** The beginning (least-significant) bit index needed for key comparison
+
+
+
+**[optional]** The past-the-end (most-significant) bit index needed for key comparison
+
+
+**Example**
+
+The code snippet below illustrates a sort of 512 integer keys and values that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive pairs.
+
+Suppose the set of input `thread_keys` across the block of threads is `{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }`. The corresponding output `thread_keys` in those threads will be `{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
+ using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
+
+ // Allocate shared memory for BlockRadixSort
+ __shared__ typename BlockRadixSort::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_keys[4];
+ int thread_values[4];
+ ...
+
+ // Collectively sort the keys and values among block threads
+ BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
+```
+
+
+
+
+Performs an ascending block-wide radix sort over a blocked arrangement of keys and values.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The code snippet below illustrates a sort of 2 keys and values that are partitioned in a blocked arrangement across 2 threads where each thread owns 1 pair.
+
+.. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin pairs-bits :end-before: example-end pairs-bits
+
+
+```cpp showLineNumbers={false}
+template
+::cuda::std::enable_if_t> cub::BlockRadixSort::Sort(
+ KeyT (&keys)[ItemsPerThread],
+ ValueT (&values)[ItemsPerThread],
+ DecomposerT decomposer,
+ int begin_bit,
+ int end_bit
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+* BlockRadixSort can only accommodate one associated tile of values. To "truck along" more than one tile of values, simply perform a key-value sort of the keys paired with a temporary value array that enumerates the key indices. The reordered indices can then be used as a gather-vector for exchanging other associated tile data through shared memory.
+* Performance is sensitive to the degree of data movement across the block.
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+**Template parameters**
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Values to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+The least-significant bit index (inclusive) needed for key comparison
+
+
+
+The most-significant bit index (exclusive) needed for key comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
+
+
+
+
+
+Performs an ascending block-wide radix sort over a blocked arrangement of keys and values.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The code snippet below illustrates a sort of 6 keys and values that are partitioned in a blocked arrangement across 2 threads where each thread owns 3 consecutive pairs.
+
+.. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin pairs :end-before: example-end pairs
+
+
+```cpp showLineNumbers={false}
+template
+::cuda::std::enable_if_t> cub::BlockRadixSort::Sort(
+ KeyT (&keys)[ItemsPerThread],
+ ValueT (&values)[ItemsPerThread],
+ DecomposerT decomposer
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+* BlockRadixSort can only accommodate one associated tile of values. To "truck along" more than one tile of values, simply perform a key-value sort of the keys paired with a temporary value array that enumerates the key indices. The reordered indices can then be used as a gather-vector for exchanging other associated tile data through shared memory.
+* Performance is sensitive to the degree of data movement across the block.
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+**Template parameters**
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Values to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+
+
+### SortDescending inline
+
+
+
+
+Performs a descending block-wide radix sort over a blocked arrangement of keys.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockRadixSort::SortDescending(
+ KeyT (&keys)[ItemsPerThread],
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+**[optional]** The beginning (least-significant) bit index needed for key comparison
+
+
+
+**[optional]** The past-the-end (most-significant) bit index needed for key comparison
+
+
+**Example**
+
+The code snippet below illustrates a sort of 512 integer keys that are partitioned in a [blocked arrangement](../index.html#sec5sec3) across 128 threads where each thread owns 4 consecutive keys.
+
+Suppose the set of input `thread_keys` across the block of threads is `{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }`. The corresponding output `thread_keys` in those threads will be `{ [511,510,509,508], [507,506,505,504], [503,502,501,500], ..., [3,2,1,0] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
+ using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
+
+ // Allocate shared memory for BlockRadixSort
+ __shared__ typename BlockRadixSort::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_keys[4];
+ ...
+
+ // Collectively sort the keys
+ BlockRadixSort(temp_storage).SortDescending(thread_keys);
+```
+
+
+
+
+Performs a descending block-wide radix sort over a blocked arrangement of keys.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The code snippet below illustrates a sort of 2 keys that are partitioned in a blocked arrangement across 2 threads where each thread owns 1 key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin keys-descending-bits :end-before: example-end keys-descending-bits
+
+
+```cpp showLineNumbers={false}
+template
+::cuda::std::enable_if_t> cub::BlockRadixSort::SortDescending(
+ KeyT (&keys)[ItemsPerThread],
+ DecomposerT decomposer,
+ int begin_bit,
+ int end_bit
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* Performance is sensitive to the degree of data movement across the block.
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+**Template parameters**
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+The least-significant bit index (inclusive) needed for key comparison
+
+
+
+The most-significant bit index (exclusive) needed for key comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
+
+
+
+
+
+Performs a descending block-wide radix sort over a blocked arrangement of keys.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The code snippet below illustrates a sort of 6 keys that are partitioned in a blocked arrangement across 2 threads where each thread owns 3 consecutive keys.
+
+.. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin keys-descending :end-before: example-end keys-descending
+
+
+```cpp showLineNumbers={false}
+template
+::cuda::std::enable_if_t> cub::BlockRadixSort::SortDescending(
+ KeyT (&keys)[ItemsPerThread],
+ DecomposerT decomposer
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* Performance is sensitive to the degree of data movement across the block.
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+**Template parameters**
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+
+
+Performs a descending block-wide radix sort across a blocked arrangement of keys and values.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockRadixSort::SortDescending(
+ KeyT (&keys)[ItemsPerThread],
+ ValueT (&values)[ItemsPerThread],
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+BlockRadixSort can only accommodate one associated tile of values. To "truck along" more than one tile of values, simply perform a key-value sort of the keys paired with a temporary value array that enumerates the key indices. The reordered indices can then be used as a gather-vector for exchanging other associated tile data through shared memory.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Values to sort
+
+
+
+**[optional]** The beginning (least-significant) bit index needed for key comparison
+
+
+
+**[optional]** The past-the-end (most-significant) bit index needed for key comparison
+
+
+**Example**
+
+The code snippet below illustrates a sort of 512 integer keys and values that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive pairs.
+
+Suppose the set of input `thread_keys` across the block of threads is `{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }`. The corresponding output `thread_keys` in those threads will be `{ [511,510,509,508], [507,506,505,504], [503,502,501,500], ..., [3,2,1,0] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
+ using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
+
+ // Allocate shared memory for BlockRadixSort
+ __shared__ typename BlockRadixSort::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_keys[4];
+ int thread_values[4];
+ ...
+
+ // Collectively sort the keys and values among block threads
+ BlockRadixSort(temp_storage).SortDescending(thread_keys, thread_values);
+```
+
+
+
+
+Performs a descending block-wide radix sort over a blocked arrangement of keys and values.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The code snippet below illustrates a sort of 2 pairs that are partitioned in a blocked arrangement across 2 threads where each thread owns 1 pair.
+
+.. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin pairs-descending-bits :end-before: example-end pairs-descending-bits
+
+
+```cpp showLineNumbers={false}
+template
+::cuda::std::enable_if_t> cub::BlockRadixSort::SortDescending(
+ KeyT (&keys)[ItemsPerThread],
+ ValueT (&values)[ItemsPerThread],
+ DecomposerT decomposer,
+ int begin_bit,
+ int end_bit
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+* BlockRadixSort can only accommodate one associated tile of values. To "truck along" more than one tile of values, simply perform a key-value sort of the keys paired with a temporary value array that enumerates the key indices. The reordered indices can then be used as a gather-vector for exchanging other associated tile data through shared memory.
+* Performance is sensitive to the degree of data movement across the block.
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+**Template parameters**
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Values to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+The least-significant bit index (inclusive) needed for key comparison
+
+
+
+The most-significant bit index (exclusive) needed for key comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
+
+
+
+
+
+Performs a descending block-wide radix sort over a blocked arrangement of keys and values.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The code snippet below illustrates a sort of 6 keys and values that are partitioned in a blocked arrangement across 2 threads where each thread owns 3 consecutive pairs.
+
+.. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin pairs-descending :end-before: example-end pairs-descending
+
+
+```cpp showLineNumbers={false}
+template
+::cuda::std::enable_if_t> cub::BlockRadixSort::SortDescending(
+ KeyT (&keys)[ItemsPerThread],
+ ValueT (&values)[ItemsPerThread],
+ DecomposerT decomposer
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+* BlockRadixSort can only accommodate one associated tile of values. To "truck along" more than one tile of values, simply perform a key-value sort of the keys paired with a temporary value array that enumerates the key indices. The reordered indices can then be used as a gather-vector for exchanging other associated tile data through shared memory.
+* Performance is sensitive to the degree of data movement across the block.
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+**Template parameters**
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Values to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+
+
+---
+
+## Sorting (blocked arrangement -> striped arrangement)
+
+### SortBlockedToStriped inline
+
+
+
+
+Performs an ascending radix sort across a blocked arrangement of keys, leaving them in a striped arrangement.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockRadixSort::SortBlockedToStriped(
+ KeyT (&keys)[ItemsPerThread],
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+**[optional]** The beginning (least-significant) bit index needed for key comparison
+
+
+
+**[optional]** The past-the-end (most-significant) bit index needed for key comparison
+
+
+**Example**
+
+The code snippet below illustrates a sort of 512 integer keys that are initially partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive keys. The final partitioning is striped.
+
+Suppose the set of input `thread_keys` across the block of threads is `{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }`. The corresponding output `thread_keys` in those threads will be `{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
+ using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
+
+ // Allocate shared memory for BlockRadixSort
+ __shared__ typename BlockRadixSort::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_keys[4];
+ ...
+
+ // Collectively sort the keys
+ BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
+```
+
+
+
+
+Performs an ascending block-wide radix sort over a blocked arrangement of keys, leaving them in a striped arrangement.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+See the `custom-type` example in `cub/test/catch2_test_block_radix_sort_custom.cu` (the region between the `example-begin custom-type` and `example-end custom-type` markers).
+
+The code snippet below illustrates a sort of 4 keys that are partitioned in a blocked arrangement across 2 threads where each thread owns 2 consecutive keys. The final partitioning is striped.
+
+See the `keys-striped-bits` example in `cub/test/catch2_test_block_radix_sort_custom.cu` (the region between the `example-begin keys-striped-bits` and `example-end keys-striped-bits` markers).
+
+
+```cpp showLineNumbers={false}
+template <class DecomposerT>
+::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>> cub::BlockRadixSort::SortBlockedToStriped(
+ KeyT (&keys)[ItemsPerThread],
+ DecomposerT decomposer,
+ int begin_bit,
+ int end_bit
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Performance is sensitive to the degree of data movement across the block. The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+The least-significant bit index (inclusive) needed for key comparison
+
+
+
+The most-significant bit index (exclusive) needed for key comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
+
+
+
+
+
+Performs an ascending block-wide radix sort over a blocked arrangement of keys, leaving them in a striped arrangement.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+See the `custom-type` example in `cub/test/catch2_test_block_radix_sort_custom.cu` (the region between the `example-begin custom-type` and `example-end custom-type` markers).
+
+The code snippet below illustrates a sort of 6 keys that are partitioned in a blocked arrangement across 2 threads where each thread owns 3 consecutive keys. The final partitioning is striped.
+
+See the `keys-striped` example in `cub/test/catch2_test_block_radix_sort_custom.cu` (the region between the `example-begin keys-striped` and `example-end keys-striped` markers).
+
+
+```cpp showLineNumbers={false}
+template <class DecomposerT>
+::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>> cub::BlockRadixSort::SortBlockedToStriped(
+ KeyT (&keys)[ItemsPerThread],
+ DecomposerT decomposer
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Performance is sensitive to the degree of data movement across the block. The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+
+
+Performs an ascending radix sort across a blocked arrangement of keys and values, leaving them in a striped arrangement.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockRadixSort::SortBlockedToStriped(
+ KeyT (&keys)[ItemsPerThread],
+ ValueT (&values)[ItemsPerThread],
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+BlockRadixSort can only accommodate one associated tile of values. To "truck along" more than one tile of values, simply perform a key-value sort of the keys paired with a temporary value array that enumerates the key indices. The reordered indices can then be used as a gather-vector for exchanging other associated tile data through shared memory.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Values to sort
+
+
+
+**[optional]** The beginning (least-significant) bit index needed for key comparison
+
+
+
+**[optional]** The past-the-end (most-significant) bit index needed for key comparison
+
+
+**Example**
+
+The code snippet below illustrates a sort of 512 integer keys and values that are initially partitioned in a [blocked arrangement](../index.html#sec5sec3) across 128 threads where each thread owns 4 consecutive pairs. The final partitioning is striped.
+
+Suppose the set of input `thread_keys` across the block of threads is `{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }`. The corresponding output `thread_keys` in those threads will be `{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
+ using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
+
+ // Allocate shared memory for BlockRadixSort
+ __shared__ typename BlockRadixSort::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_keys[4];
+ int thread_values[4];
+ ...
+
+ // Collectively sort the keys and values among block threads
+ BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
+```
+
+
+
+
+Performs an ascending block-wide radix sort over a blocked arrangement of keys and values, leaving them in a striped arrangement.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+See the `custom-type` example in `cub/test/catch2_test_block_radix_sort_custom.cu` (the region between the `example-begin custom-type` and `example-end custom-type` markers).
+
+The code snippet below illustrates a sort of 4 pairs that are partitioned in a blocked arrangement across 2 threads where each thread owns 2 consecutive pairs. The final partitioning is striped.
+
+See the `pairs-striped-bits` example in `cub/test/catch2_test_block_radix_sort_custom.cu` (the region between the `example-begin pairs-striped-bits` and `example-end pairs-striped-bits` markers).
+
+
+```cpp showLineNumbers={false}
+template <class DecomposerT>
+::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>> cub::BlockRadixSort::SortBlockedToStriped(
+ KeyT (&keys)[ItemsPerThread],
+ ValueT (&values)[ItemsPerThread],
+ DecomposerT decomposer,
+ int begin_bit,
+ int end_bit
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Performance is sensitive to the degree of data movement across the block. The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Values to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+The least-significant bit index (inclusive) needed for key comparison
+
+
+
+The most-significant bit index (exclusive) needed for key comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
+
+
+
+
+
+Performs an ascending block-wide radix sort over a blocked arrangement of keys and values, leaving them in a striped arrangement.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+See the `custom-type` example in `cub/test/catch2_test_block_radix_sort_custom.cu` (the region between the `example-begin custom-type` and `example-end custom-type` markers).
+
+The code snippet below illustrates a sort of 6 pairs that are partitioned in a blocked arrangement across 2 threads where each thread owns 3 consecutive pairs. The final partitioning is striped.
+
+See the `pairs-striped` example in `cub/test/catch2_test_block_radix_sort_custom.cu` (the region between the `example-begin pairs-striped` and `example-end pairs-striped` markers).
+
+
+```cpp showLineNumbers={false}
+template <class DecomposerT>
+::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>> cub::BlockRadixSort::SortBlockedToStriped(
+ KeyT (&keys)[ItemsPerThread],
+ ValueT (&values)[ItemsPerThread],
+ DecomposerT decomposer
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Performance is sensitive to the degree of data movement across the block. The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Values to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+
+
+### SortDescendingBlockedToStriped inline
+
+
+
+
+Performs a descending radix sort across a blocked arrangement of keys, leaving them in a striped arrangement.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockRadixSort::SortDescendingBlockedToStriped(
+ KeyT (&keys)[ItemsPerThread],
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+**[optional]** The beginning (least-significant) bit index needed for key comparison
+
+
+
+**[optional]** The past-the-end (most-significant) bit index needed for key comparison
+
+
+**Example**
+
+The code snippet below illustrates a sort of 512 integer keys that are initially partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive keys. The final partitioning is striped.
+
+Suppose the set of input `thread_keys` across the block of threads is `{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }`. The corresponding output `thread_keys` in those threads will be `{ [511,383,255,127], [510,382,254,126], [509,381,253,125], ..., [384,256,128,0] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
+ using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
+
+ // Allocate shared memory for BlockRadixSort
+ __shared__ typename BlockRadixSort::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_keys[4];
+ ...
+
+ // Collectively sort the keys
+ BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(thread_keys);
+```
+
+
+
+
+Performs a descending block-wide radix sort over a blocked arrangement of keys, leaving them in a striped arrangement.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+See the `custom-type` example in `cub/test/catch2_test_block_radix_sort_custom.cu` (the region between the `example-begin custom-type` and `example-end custom-type` markers).
+
+The code snippet below illustrates a sort of 4 keys that are partitioned in a blocked arrangement across 2 threads where each thread owns 2 consecutive keys. The final partitioning is striped.
+
+See the `keys-striped-descending-bits` example in `cub/test/catch2_test_block_radix_sort_custom.cu` (the region between the `example-begin keys-striped-descending-bits` and `example-end keys-striped-descending-bits` markers).
+
+
+```cpp showLineNumbers={false}
+template <class DecomposerT>
+::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>> cub::BlockRadixSort::SortDescendingBlockedToStriped(
+ KeyT (&keys)[ItemsPerThread],
+ DecomposerT decomposer,
+ int begin_bit,
+ int end_bit
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Performance is sensitive to the degree of data movement across the block. The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+The least-significant bit index (inclusive) needed for key comparison
+
+
+
+The most-significant bit index (exclusive) needed for key comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
+
+
+
+
+
+Performs a descending block-wide radix sort over a blocked arrangement of keys, leaving them in a striped arrangement.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+See the `custom-type` example in `cub/test/catch2_test_block_radix_sort_custom.cu` (the region between the `example-begin custom-type` and `example-end custom-type` markers).
+
+The code snippet below illustrates a sort of 6 keys that are partitioned in a blocked arrangement across 2 threads where each thread owns 3 consecutive keys. The final partitioning is striped.
+
+See the `keys-striped-descending` example in `cub/test/catch2_test_block_radix_sort_custom.cu` (the region between the `example-begin keys-striped-descending` and `example-end keys-striped-descending` markers).
+
+
+```cpp showLineNumbers={false}
+template <class DecomposerT>
+::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>> cub::BlockRadixSort::SortDescendingBlockedToStriped(
+ KeyT (&keys)[ItemsPerThread],
+ DecomposerT decomposer
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Performance is sensitive to the degree of data movement across the block. The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+
+
+Performs a descending radix sort across a blocked arrangement of keys and values, leaving them in a striped arrangement.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockRadixSort::SortDescendingBlockedToStriped(
+ KeyT (&keys)[ItemsPerThread],
+ ValueT (&values)[ItemsPerThread],
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+BlockRadixSort can only accommodate one associated tile of values. To "truck along" more than one tile of values, simply perform a key-value sort of the keys paired with a temporary value array that enumerates the key indices. The reordered indices can then be used as a gather-vector for exchanging other associated tile data through shared memory.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Values to sort
+
+
+
+**[optional]** The beginning (least-significant) bit index needed for key comparison
+
+
+
+**[optional]** The past-the-end (most-significant) bit index needed for key comparison
+
+
+**Example**
+
+The code snippet below illustrates a sort of 512 integer keys and values that are initially partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive pairs. The final partitioning is striped.
+
+Suppose the set of input `thread_keys` across the block of threads is `{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }`. The corresponding output `thread_keys` in those threads will be `{ [511,383,255,127], [510,382,254,126], [509,381,253,125], ..., [384,256,128,0] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
+ using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
+
+ // Allocate shared memory for BlockRadixSort
+ __shared__ typename BlockRadixSort::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_keys[4];
+ int thread_values[4];
+ ...
+
+ // Collectively sort the keys and values among block threads
+ BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(thread_keys, thread_values);
+```
+
+
+
+
+Performs a descending block-wide radix sort over a blocked arrangement of keys and values, leaving them in a striped arrangement.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+See the `custom-type` example in `cub/test/catch2_test_block_radix_sort_custom.cu` (the region between the `example-begin custom-type` and `example-end custom-type` markers).
+
+The code snippet below illustrates a sort of 4 keys and values that are partitioned in a blocked arrangement across 2 threads where each thread owns 2 consecutive pairs. The final partitioning is striped.
+
+See the `pairs-striped-descending-bits` example in `cub/test/catch2_test_block_radix_sort_custom.cu` (the region between the `example-begin pairs-striped-descending-bits` and `example-end pairs-striped-descending-bits` markers).
+
+
+```cpp showLineNumbers={false}
+template <class DecomposerT>
+::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>> cub::BlockRadixSort::SortDescendingBlockedToStriped(
+ KeyT (&keys)[ItemsPerThread],
+ ValueT (&values)[ItemsPerThread],
+ DecomposerT decomposer,
+ int begin_bit,
+ int end_bit
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Performance is sensitive to the degree of data movement across the block. The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Values to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+The least-significant bit index (inclusive) needed for key comparison
+
+
+
+The most-significant bit index (exclusive) needed for key comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
+
+
+
+
+
+Performs a descending block-wide radix sort over a blocked arrangement of keys and values, leaving them in a striped arrangement.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+See the `custom-type` example in `cub/test/catch2_test_block_radix_sort_custom.cu` (the region between the `example-begin custom-type` and `example-end custom-type` markers).
+
+The code snippet below illustrates a sort of 6 keys and values that are partitioned in a blocked arrangement across 2 threads where each thread owns 3 consecutive pairs. The final partitioning is striped.
+
+See the `pairs-striped-descending` example in `cub/test/catch2_test_block_radix_sort_custom.cu` (the region between the `example-begin pairs-striped-descending` and `example-end pairs-striped-descending` markers).
+
+
+```cpp showLineNumbers={false}
+template <class DecomposerT>
+::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>> cub::BlockRadixSort::SortDescendingBlockedToStriped(
+ KeyT (&keys)[ItemsPerThread],
+ ValueT (&values)[ItemsPerThread],
+ DecomposerT decomposer
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Performance is sensitive to the degree of data movement across the block. The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Values to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+
+
+---
+
+## Utility methods
+
+### PrivateStorage inline
+
+Internal storage allocator.
+
+
+```cpp showLineNumbers={false}
+_TempStorage & cub::BlockRadixSort::PrivateStorage()
+```
+
+
+### RankKeys inline
+
+
+
+
+Rank keys (specialized for ascending sort).
+
+
+```cpp showLineNumbers={false}
+template <class DigitExtractorT>
+void cub::BlockRadixSort::RankKeys(
+ bit_ordered_type (&unsigned_keys)[ItemsPerThread],
+ int (&ranks)[ItemsPerThread],
+ DigitExtractorT digit_extractor,
+ ::cuda::std::false_type
+)
+```
+
+
+
+
+
+Rank keys (specialized for descending sort).
+
+
+```cpp showLineNumbers={false}
+template <class DigitExtractorT>
+void cub::BlockRadixSort::RankKeys(
+ bit_ordered_type (&unsigned_keys)[ItemsPerThread],
+ int (&ranks)[ItemsPerThread],
+ DigitExtractorT digit_extractor,
+ ::cuda::std::true_type
+)
+```
+
+
+
+
+
+### ExchangeValues inline
+
+
+
+
+ExchangeValues (specialized for key-value sort, to-blocked arrangement).
+
+
+```cpp showLineNumbers={false}
+void cub::BlockRadixSort::ExchangeValues(
+ ValueT (&values)[ItemsPerThread],
+ int (&ranks)[ItemsPerThread],
+ ::cuda::std::false_type,
+ ::cuda::std::true_type
+)
+```
+
+
+
+
+
+ExchangeValues (specialized for key-value sort, to-striped arrangement).
+
+
+```cpp showLineNumbers={false}
+void cub::BlockRadixSort::ExchangeValues(
+ ValueT (&values)[ItemsPerThread],
+ int (&ranks)[ItemsPerThread],
+ ::cuda::std::false_type,
+ ::cuda::std::false_type
+)
+```
+
+
+
+
+
+ExchangeValues (specialized for keys-only sort).
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockRadixSort::ExchangeValues(
+ ValueT (&)[ItemsPerThread],
+ int (&)[ItemsPerThread],
+ ::cuda::std::true_type,
+ ::cuda::std::bool_constant
+)
+```
+
+
+
+
+
+### SortBlocked inline
+
+Sort blocked arrangement.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::BlockRadixSort::SortBlocked(
+ KeyT (&keys)[ItemsPerThread],
+ ValueT (&values)[ItemsPerThread],
+ int begin_bit,
+ int end_bit,
+ ::cuda::std::bool_constant is_descending,
+ ::cuda::std::bool_constant is_keys_only,
+ DecomposerT decomposer = {}
+)
+```
+
+
+**Parameters**
+
+
+Keys to sort
+
+
+
+Values to sort
+
+
+
+The beginning (least-significant) bit index needed for key comparison
+
+
+
+The past-the-end (most-significant) bit index needed for key comparison
+
+
+
+Tag indicating whether this is a descending-order sort
+
+
+
+Tag indicating whether this is a keys-only sort
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `traits` | `detail::radix::traits_t< KeyT >` | |
+| `bit_ordered_type` | `typename traits::bit_ordered_type` | |
+| `bit_ordered_conversion` | `typename traits::bit_ordered_conversion_policy` | |
+| `AscendingBlockRadixRank` | `BlockRadixRank< BlockDimX, RadixBits, false, MemoizeOuterScan, InnerScanAlgorithm, SMemConfig, BlockDimY, BlockDimZ >` | Ascending [BlockRadixRank](/library/api/cub::_block_radix_rank) utility type. |
+| `DescendingBlockRadixRank` | `BlockRadixRank< BlockDimX, RadixBits, true, MemoizeOuterScan, InnerScanAlgorithm, SMemConfig, BlockDimY, BlockDimZ >` | Descending [BlockRadixRank](/library/api/cub::_block_radix_rank) utility type. |
+| `fundamental_digit_extractor_t` | `BFEDigitExtractor< KeyT >` | Digit extractor type. |
+| `BlockExchangeKeys` | `BlockExchange< KeyT, BlockDimX, ItemsPerThread, false, BlockDimY, BlockDimZ >` | [BlockExchange](/library/api/cub::_block_exchange) utility type for keys. |
+| `BlockExchangeValues` | `BlockExchange< ValueT, BlockDimX, ItemsPerThread, false, BlockDimY, BlockDimZ >` | [BlockExchange](/library/api/cub::_block_exchange) utility type for values. |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | |
+| `KEYS_ONLY` static constexpr | `bool` | |
+| `temp_storage` | `_TempStorage &` | Shared storage reference. |
+| `linear_tid` | `unsigned int` | Linear thread-id. |
+
+---
+
+## Inner classes
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockRadixSort::TempStorage
+```
+
+
+The operations exposed by `BlockRadixSort` require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the `__shared__` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or `union`'d with other storage allocation types to facilitate memory reuse.
+
+**Inherits from:** `Uninitialized< _TempStorage >` (public)
diff --git a/fern/cudapages/cub/cub/cub/BlockRakingLayout.mdx b/fern/cudapages/cub/cub/cub/BlockRakingLayout.mdx
new file mode 100644
index 0000000..8417726
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockRakingLayout.mdx
@@ -0,0 +1,96 @@
+---
+title: cub::BlockRakingLayout
+description: ""
+---
+
+BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data.
+
+
+
+
+
+The data type to be exchanged.
+
+
+
+The thread block size in threads.
+
+
+
+
+
+---
+
+## Static methods
+
+### PlacementPtr inline static
+
+Returns the location for the calling thread to place data into the grid.
+
+
+```cpp showLineNumbers={false}
+static T * cub::BlockRakingLayout::PlacementPtr(
+ TempStorage &temp_storage,
+ unsigned int linear_tid
+)
+```
+
+
+### RakingPtr inline static
+
+Returns the location for the calling thread to begin sequential raking.
+
+
+```cpp showLineNumbers={false}
+static T * cub::BlockRakingLayout::RakingPtr(
+ TempStorage &temp_storage,
+ unsigned int linear_tid
+)
+```
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `SHARED_ELEMENTS` static constexpr | `int` | The total number of elements that need to be cooperatively reduced. |
+| `MAX_RAKING_THREADS` static constexpr | `int` | Maximum number of warp-synchronous raking threads. |
+| `SEGMENT_LENGTH` static constexpr | `int` | Number of raking elements per warp-synchronous raking thread (rounded up). |
+| `RAKING_THREADS` static constexpr | `int` | Never use a raking thread that will have no valid data (e.g., when BlockThreads is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads). |
+| `HAS_CONFLICTS` static constexpr | `bool` | Whether we will have bank conflicts (technically we should find out if the GCD is > 1). |
+| `CONFLICT_DEGREE` static constexpr | `int` | Degree of bank conflicts (e.g., 4-way). |
+| `USE_SEGMENT_PADDING` static constexpr | `bool` | Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load. |
+| `GRID_ELEMENTS` static constexpr | `int` | Total number of elements in the raking grid. |
+| `UNGUARDED` static constexpr | `int` | Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads). |
+
+---
+
+## Inner classes
+
+### _TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockRakingLayout::_TempStorage
+```
+
+
+Shared memory storage type.
+
+| Name | Type | Description |
+|---|---|---|
+| `buff` | `T` | |
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockRakingLayout::TempStorage
+```
+
+
+Alias wrapper allowing storage to be unioned.
+
+**Inherits from:** `Uninitialized< _TempStorage >` (public)
diff --git a/fern/cudapages/cub/cub/cub/BlockReduce.mdx b/fern/cudapages/cub/cub/cub/BlockReduce.mdx
new file mode 100644
index 0000000..2dbf17d
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockReduce.mdx
@@ -0,0 +1,565 @@
+---
+title: cub::BlockReduce
+description: ""
+---
+
+The BlockReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread block.
+
+## Performance considerations
+
+- Performance is sensitive to the degree of data movement across the block.
+- Very efficient (only one synchronization barrier).
+- Incurs zero bank conflicts for most types
+- Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ - Summation (vs. generic reduction)
+ - `BLOCK_THREADS` is a multiple of the architecture's warp size
+ - Every thread has a valid input (i.e., full vs. partial-tiles)
+- See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
+
+## Example
+
+The code snippet below illustrates a sum reduction of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_reduce.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockReduce for a 1D block of 128 threads of type int
+ using BlockReduce = cub::BlockReduce<int, 128>;
+
+ // Allocate shared memory for BlockReduce
+ __shared__ typename BlockReduce::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Compute the block-wide sum for thread0
+ int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+}
+```
+
+
+
+
+
+Data type being reduced
+
+
+
+The thread block length in threads along the X dimension
+
+
+
+**[optional]** [cub::BlockReduceAlgorithm](/library/api/cub::BlockReduceAlgorithm) enumerator specifying the underlying algorithm to use (default: [cub::BLOCK_REDUCE_WARP_REDUCTIONS](/library/api/cub::BLOCK_REDUCE_WARP_REDUCTIONS))
+
+
+
+**[optional]** The thread block length in threads along the Y dimension (default: 1)
+
+
+
+**[optional]** The thread block length in threads along the Z dimension (default: 1)
+
+
+
+
+
+---
+
+## Collective constructors
+
+### BlockReduce inline
+
+
+
+
+Collective constructor using a private static allocation of shared memory as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockReduce::BlockReduce()
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+Collective constructor using the specified memory allocation as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockReduce::BlockReduce(
+ TempStorage &temp_storage
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Parameters**
+
+
+Reference to memory allocation having layout type [TempStorage](/library/api/cub::BlockReduce::TempStorage)
+
+
+
+
+
+---
+
+## Generic reductions
+
+### Reduce inline
+
+
+
+
+Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes one input element.
+
+
+```cpp showLineNumbers={false}
+template <typename ReductionOp>
+T cub::BlockReduce::Reduce(
+ T input,
+ ReductionOp reduction_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Assumes threads are in row-major order.
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+
+The return value is undefined in threads other than thread0.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input
+
+
+
+Binary reduction functor
+
+
+**Example**
+
+The code snippet below illustrates a max reduction of 128 integer items that are partitioned across 128 threads.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_reduce.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockReduce for a 1D block of 128 threads of type int
+ using BlockReduce = cub::BlockReduce<int, 128>;
+
+ // Allocate shared memory for BlockReduce
+ __shared__ typename BlockReduce::TempStorage temp_storage;
+
+ // Each thread obtains an input item
+ int thread_data;
+ ...
+
+ // Compute the block-wide max for thread0
+ int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cuda::maximum<>{});
+}
+```
+
+
+
+
+Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD, typename ReductionOp>
+T cub::BlockReduce::Reduce(
+ T (&inputs)[ITEMS_PER_THREAD],
+ ReductionOp reduction_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Performance is sensitive to the degree of data movement across the block.
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+
+The return value is undefined in threads other than thread0.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+
+**[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input segment
+
+
+
+Binary reduction functor
+
+
+**Example**
+
+The code snippet below illustrates a max reduction of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_reduce.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockReduce for a 1D block of 128 threads of type int
+ using BlockReduce = cub::BlockReduce<int, 128>;
+
+ // Allocate shared memory for BlockReduce
+ __shared__ typename BlockReduce::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Compute the block-wide max for thread0
+ int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cuda::maximum<>{});
+}
+```
+
+
+
+
+Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first `num_valid` threads each contribute one input element.
+
+
+```cpp showLineNumbers={false}
+template <typename ReductionOp>
+T cub::BlockReduce::Reduce(
+ T input,
+ ReductionOp reduction_op,
+ int num_valid
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Assumes threads are in row-major order.
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+
+The return value is undefined in threads other than thread0.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input
+
+
+
+Binary reduction functor
+
+
+
+Number of threads containing valid elements (may be less than BLOCK_THREADS)
+
+
+**Example**
+
+The code snippet below illustrates a max reduction of a partially-full tile of integer items that are partitioned across 128 threads.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_reduce.cuh>
+
+__global__ void ExampleKernel(int num_valid, ...)
+{
+ // Specialize BlockReduce for a 1D block of 128 threads of type int
+ using BlockReduce = cub::BlockReduce<int, 128>;
+
+ // Allocate shared memory for BlockReduce
+ __shared__ typename BlockReduce::TempStorage temp_storage;
+
+ // Each thread obtains an input item
+ int thread_data;
+ if (threadIdx.x < num_valid) thread_data = ...
+
+ // Compute the block-wide max for thread0
+ int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cuda::maximum<>{}, num_valid);
+}
+```
+
+
+
+
+---
+
+## Summation reductions
+
+### Sum inline
+
+
+
+
+Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes one input element.
+
+
+```cpp showLineNumbers={false}
+T cub::BlockReduce::Sum(
+ T input
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Assumes threads are in row-major order.
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+
+The return value is undefined in threads other than thread0.
+
+
+**Parameters**
+
+
+Calling thread's input
+
+
+**Example**
+
+The code snippet below illustrates a sum reduction of 128 integer items that are partitioned across 128 threads.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_reduce.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockReduce for a 1D block of 128 threads of type int
+ using BlockReduce = cub::BlockReduce<int, 128>;
+
+ // Allocate shared memory for BlockReduce
+ __shared__ typename BlockReduce::TempStorage temp_storage;
+
+ // Each thread obtains an input item
+ int thread_data;
+ ...
+
+ // Compute the block-wide sum for thread0
+ int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+}
+```
+
+
+
+
+Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD>
+T cub::BlockReduce::Sum(
+ T (&inputs)[ITEMS_PER_THREAD]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Performance is sensitive to the degree of data movement across the block.
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+
+The return value is undefined in threads other than thread0.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+**Parameters**
+
+
+Calling thread's input segment
+
+
+**Example**
+
+The code snippet below illustrates a sum reduction of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_reduce.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockReduce for a 1D block of 128 threads of type int
+ using BlockReduce = cub::BlockReduce<int, 128>;
+
+ // Allocate shared memory for BlockReduce
+ __shared__ typename BlockReduce::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Compute the block-wide sum for thread0
+ int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+}
+```
+
+
+
+
+Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. The first `num_valid` threads each contribute one input element.
+
+
+```cpp showLineNumbers={false}
+T cub::BlockReduce::Sum(
+ T input,
+ int num_valid
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Assumes threads are in row-major order.
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+
+The return value is undefined in threads other than thread0.
+
+
+**Parameters**
+
+
+Calling thread's input
+
+
+
+Number of threads containing valid elements (may be less than BLOCK_THREADS)
+
+
+**Example**
+
+The code snippet below illustrates a sum reduction of a partially-full tile of integer items that are partitioned across 128 threads.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_reduce.cuh>
+
+__global__ void ExampleKernel(int num_valid, ...)
+{
+ // Specialize BlockReduce for a 1D block of 128 threads of type int
+ using BlockReduce = cub::BlockReduce<int, 128>;
+
+ // Allocate shared memory for BlockReduce
+ __shared__ typename BlockReduce::TempStorage temp_storage;
+
+ // Each thread obtains an input item (up to num_valid)
+ int thread_data;
+ if (threadIdx.x < num_valid)
+ thread_data = ...
+
+ // Compute the block-wide sum for thread0
+ int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);
+}
+```
+
+
+
+
+---
+
+## Utility methods
+
+### PrivateStorage inline
+
+Internal storage allocator.
+
+
+```cpp showLineNumbers={false}
+_TempStorage & cub::BlockReduce::PrivateStorage()
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `WarpReductions` | `detail::BlockReduceWarpReductions< T, BlockDimX, BlockDimY, BlockDimZ >` | |
+| `WarpReductionsNondeterministic` | `detail::BlockReduceWarpReductions< T, BlockDimX, BlockDimY, BlockDimZ, false >` | |
+| `RakingCommutativeOnly` | `detail::BlockReduceRakingCommutativeOnly< T, BlockDimX, BlockDimY, BlockDimZ >` | |
+| `Raking` | `detail::BlockReduceRaking< T, BlockDimX, BlockDimY, BlockDimZ >` | |
+| `InternalBlockReduce` | `::cuda::std::_If< Algorithm==BLOCK_REDUCE_WARP_REDUCTIONS, WarpReductions, ::cuda::std::_If< Algorithm==BLOCK_REDUCE_WARP_REDUCTIONS_NONDETERMINISTIC, WarpReductionsNondeterministic, ::cuda::std::_If< Algorithm==BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, RakingCommutativeOnly, Raking > > >` | Internal specialization type. |
+| `_TempStorage` | `typename InternalBlockReduce::TempStorage` | Shared memory storage layout type for `BlockReduce`. |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | The thread block size in threads. |
+| `temp_storage` | `_TempStorage &` | Shared storage reference. |
+| `linear_tid` | `unsigned int` | Linear thread-id. |
+
+---
+
+## Inner classes
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockReduce::TempStorage
+```
+
+
+The operations exposed by `BlockReduce` require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the `__shared__` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or `union`'d with other storage allocation types to facilitate memory reuse.
+
+**Inherits from:** `Uninitialized< _TempStorage >` (public)
diff --git a/fern/cudapages/cub/cub/cub/BlockRunLengthDecode.mdx b/fern/cudapages/cub/cub/cub/BlockRunLengthDecode.mdx
new file mode 100644
index 0000000..e16de79
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockRunLengthDecode.mdx
@@ -0,0 +1,296 @@
+---
+title: cub::BlockRunLengthDecode
+description: ""
+---
+
+The BlockRunLengthDecode class supports decoding a run-length encoded array of items. That is, given the two arrays `run_value[N]` and `run_lengths[N]`, `run_value[i]` is repeated `run_lengths[i]` many times in the output array. Due to the nature of the run-length decoding algorithm ("decompression"), the output size of the run-length decoded array is runtime-dependent and potentially without any upper bound. To address this, BlockRunLengthDecode allows retrieving a "window" from the run-length decoded array. The window's offset can be specified and BLOCK_THREADS * DecodedItemsPerThread (i.e., referred to as window_size) decoded items from the specified window will be returned.
+
+> **Note:** Trailing runs of length 0 are supported (i.e., they may only appear at the end of the `run_lengths` array). A run of length zero may not be followed by a run length that is not zero.
+
+Suppose the set of input `run_values` across the block of threads is `{ [0, 1], [2, 3], [4, 5], [6, 7], ..., [254, 255] }` and `run_lengths` is `{ [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }`. The corresponding output `decoded_items` in those threads will be `{ [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4], [4, 4, 4, 5], ..., [169, 169, 170, 171] }` and `relative_offsets` will be `{ [0, 0, 1, 0], [1, 2, 0, 1], [2, 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] }` during the first iteration of the while loop.
+
+
+
+
+
+The data type of the items being run-length decoded
+
+
+
+The thread block length in threads along the X dimension
+
+
+
+The number of consecutive runs that each thread contributes
+
+
+
+The maximum number of decoded items that each thread holds
+
+
+
+Type used to index into the block's decoded items (large enough to hold the sum over all the runs' lengths)
+
+
+
+The thread block length in threads along the Y dimension
+
+
+
+The thread block length in threads along the Z dimension
+
+
+
+
+
+---
+
+## Constructors
+
+### BlockRunLengthDecode inline
+
+
+
+
+Constructor specialised for user-provided temporary storage, initializing using the runs' lengths.
+
+
+```cpp showLineNumbers={false}
+template <typename RunLengthT, typename TotalDecodedSizeT>
+cub::BlockRunLengthDecode::BlockRunLengthDecode(
+ TempStorage &temp_storage,
+ ItemT (&run_values)[RunsPerThread],
+ RunLengthT (&run_lengths)[RunsPerThread],
+ TotalDecodedSizeT &total_decoded_size
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+Constructor specialised for user-provided temporary storage, initializing using the runs' offsets.
+
+
+```cpp showLineNumbers={false}
+template <typename UserRunOffsetT>
+cub::BlockRunLengthDecode::BlockRunLengthDecode(
+ TempStorage &temp_storage,
+ ItemT (&run_values)[RunsPerThread],
+ UserRunOffsetT (&run_offsets)[RunsPerThread]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+Constructor specialised for static temporary storage, initializing using the runs' lengths.
+
+
+```cpp showLineNumbers={false}
+template <typename RunLengthT, typename TotalDecodedSizeT>
+cub::BlockRunLengthDecode::BlockRunLengthDecode(
+ ItemT (&run_values)[RunsPerThread],
+ RunLengthT (&run_lengths)[RunsPerThread],
+ TotalDecodedSizeT &total_decoded_size
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+Constructor specialised for static temporary storage, initializing using the runs' offsets.
+
+
+```cpp showLineNumbers={false}
+template <typename UserRunOffsetT>
+cub::BlockRunLengthDecode::BlockRunLengthDecode(
+ ItemT (&run_values)[RunsPerThread],
+ UserRunOffsetT (&run_offsets)[RunsPerThread]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+---
+
+## Methods
+
+### PrivateStorage inline
+
+Internal storage allocator (used when the user does not provide pre-allocated shared memory).
+
+
+```cpp showLineNumbers={false}
+_TempStorage & cub::BlockRunLengthDecode::PrivateStorage()
+```
+
+
+### StaticUpperBound inline
+
+Returns the offset of the first value within `input` which compares greater than `val`.
+
+This version takes `MAX_NUM_ITEMS`, an upper bound of the array size, which will be used to determine the number of binary search iterations at compile time.
+
+
+```cpp showLineNumbers={false}
+template <int MAX_NUM_ITEMS, typename InputIteratorT, typename OffsetT, typename T>
+OffsetT cub::BlockRunLengthDecode::StaticUpperBound(
+ InputIteratorT input,
+ OffsetT num_items,
+ T val
+)
+```
+
+
+**Parameters**
+
+
+Input sequence
+
+
+
+Input sequence length
+
+
+
+Search key
+
+
+### InitWithRunOffsets inline
+
+
+```cpp showLineNumbers={false}
+template <typename RunOffsetT>
+void cub::BlockRunLengthDecode::InitWithRunOffsets(
+ ItemT (&run_values)[RunsPerThread],
+ RunOffsetT (&run_offsets)[RunsPerThread]
+)
+```
+
+
+### InitWithRunLengths inline
+
+
+```cpp showLineNumbers={false}
+template <typename RunLengthT, typename TotalDecodedSizeT>
+void cub::BlockRunLengthDecode::InitWithRunLengths(
+ ItemT (&run_values)[RunsPerThread],
+ RunLengthT (&run_lengths)[RunsPerThread],
+ TotalDecodedSizeT &total_decoded_size
+)
+```
+
+
+### RunLengthDecode inline
+
+
+
+
+Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded items in a blocked arrangement to `decoded_items`.
+
+
+```cpp showLineNumbers={false}
+template <typename RelativeOffsetT>
+void cub::BlockRunLengthDecode::RunLengthDecode(
+ ItemT (&decoded_items)[DecodedItemsPerThread],
+ RelativeOffsetT (&item_offsets)[DecodedItemsPerThread],
+ DecodedOffsetT from_decoded_offset = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Parameters**
+
+
+The run-length decoded items to be returned in a blocked arrangement
+
+
+
+The run-length decoded items' relative offset within the run they belong to
+
+
+
+Invoking this method with a `from_decoded_offset` that is larger than `total_decoded_size` results in undefined behavior.
+
+
+
+
+
+Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded items in a blocked arrangement to `decoded_items`.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockRunLengthDecode::RunLengthDecode(
+ ItemT (&decoded_items)[DecodedItemsPerThread],
+ DecodedOffsetT from_decoded_offset = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Parameters**
+
+
+The run-length decoded items to be returned in a blocked arrangement
+
+
+
+Invoking this method with a `from_decoded_offset` that is larger than `total_decoded_size` results in undefined behavior.
+
+
+
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `RunOffsetScanT` | `BlockScan< DecodedOffsetT, BlockDimX, BLOCK_SCAN_RAKING_MEMOIZE, BlockDimY, BlockDimZ >` | [BlockScan](/library/api/cub::_block_scan) used to determine the beginning of each run (i.e., prefix sum over the runs' length). |
+| `RunOffsetT` | `uint32_t` | Type used to index into the block's runs. |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | The thread block size in threads. |
+| `BLOCK_RUNS` static constexpr | `int` | The number of runs that the block decodes (out-of-bounds items may be padded with run lengths of '0'). |
+| `temp_storage` | `_TempStorage &` | Shared storage reference. |
+| `linear_tid` | `uint32_t` | Linear thread-id. |
+
+---
+
+## Inner classes
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockRunLengthDecode::TempStorage
+```
+
+
+**Inherits from:** `Uninitialized< _TempStorage >` (public)
diff --git a/fern/cudapages/cub/cub/cub/BlockScan.mdx b/fern/cudapages/cub/cub/cub/BlockScan.mdx
new file mode 100644
index 0000000..e36ca1d
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockScan.mdx
@@ -0,0 +1,1797 @@
+---
+title: cub::BlockScan
+description: ""
+---
+
+The BlockScan class provides collective methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block.
+
+## Performance considerations
+
+- Performance is sensitive to the degree of data movement across the block.
+- Uses special instructions when applicable (e.g., warp `SHFL`)
+- Uses synchronization-free communication between warp lanes when applicable
+- Invokes a minimal number of block-wide synchronization barriers (only
+ one or two depending on algorithm selection)
+- Incurs zero bank conflicts for most types
+- Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+
+ - Prefix sum variants (vs. generic scan)
+ - `BLOCK_THREADS` is a multiple of the architecture's warp size
+
+- See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
+
+
+
+
+
+Data type being scanned
+
+
+
+The thread block length in threads along the X dimension
+
+
+
+**[optional]** [cub::BlockScanAlgorithm](/library/api/cub::BlockScanAlgorithm) enumerator specifying the underlying algorithm to use (default: [cub::BLOCK_SCAN_RAKING](/library/api/cub::BLOCK_SCAN_RAKING))
+
+
+
+**[optional]** The thread block length in threads along the Y dimension (default: 1)
+
+
+
+**[optional]** The thread block length in threads along the Z dimension (default: 1)
+
+
+
+
+
+---
+
+## Collective constructors
+
+### BlockScan inline
+
+
+
+
+Collective constructor using a private static allocation of shared memory as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockScan::BlockScan()
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+Collective constructor using the specified memory allocation as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockScan::BlockScan(
+ TempStorage &temp_storage
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Parameters**
+
+
+Reference to memory allocation having layout type [TempStorage](/library/api/cub::BlockScan::TempStorage)
+
+
+
+
+
+---
+
+## Exclusive prefix sum operations
+
+### ExclusiveSum inline
+
+
+
+
+Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to `output` in *thread*0.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockScan::ExclusiveSum(
+ T input,
+ T &output
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Uses the identity element (zero) as the initial value.
+Assumes threads are in row-major order.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item (may be aliased to `input`)
+
+
+
+
+
+Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to `output` in *thread*0. Also provides every thread with the block-wide `block_aggregate` of all inputs.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockScan::ExclusiveSum(
+ T input,
+ T &output,
+ T &block_aggregate
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Uses the identity element (zero) as the initial value.
+Assumes threads are in row-major order.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item (may be aliased to `input`)
+
+
+
+Block-wide aggregate reduction of input items
+
+
+
+
+
+Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor `block_prefix_callback_op` is invoked by the first warp in the block, and the value returned by *lane*0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
+
+
+```cpp showLineNumbers={false}
+template <typename BlockPrefixCallbackOp>
+void cub::BlockScan::ExclusiveSum(
+ T input,
+ T &output,
+ BlockPrefixCallbackOp &block_prefix_callback_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Uses the identity element (zero) as the initial value.
+The `block_prefix_callback_op` functor must implement a member function `T operator()(T block_aggregate)`. The functor will be invoked by the first warp of threads in the block, however only the return value from *lane*0 is applied as the block-wide prefix. Can be stateful.
+Assumes threads are in row-major order.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item (may be aliased to `input`)
+
+
+
+*warp*0 only call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+
+
+
+
+
+Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to `output[0]` in *thread*0.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD>
+void cub::BlockScan::ExclusiveSum(
+ T (&input)[ITEMS_PER_THREAD],
+ T (&output)[ITEMS_PER_THREAD]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Uses the identity element (zero) as the initial value.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+**Parameters**
+
+
+Calling thread's input items
+
+
+
+Calling thread's output items (may be aliased to `input`)
+
+
+
+
+
+Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to `output[0]` in *thread*0. Also provides every thread with the block-wide `block_aggregate` of all inputs.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD>
+void cub::BlockScan::ExclusiveSum(
+ T (&input)[ITEMS_PER_THREAD],
+ T (&output)[ITEMS_PER_THREAD],
+ T &block_aggregate
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Uses the identity element (zero) as the initial value.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+**Parameters**
+
+
+Calling thread's input items
+
+
+
+Calling thread's output items (may be aliased to `input`)
+
+
+
+Block-wide aggregate reduction of input items
+
+
+
+
+
+Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor `block_prefix_callback_op` is invoked by the first warp in the block, and the value returned by *lane*0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
+void cub::BlockScan::ExclusiveSum(
+ T (&input)[ITEMS_PER_THREAD],
+ T (&output)[ITEMS_PER_THREAD],
+ BlockPrefixCallbackOp &block_prefix_callback_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Uses the identity element (zero) as the initial value.
+The `block_prefix_callback_op` functor must implement a member function `T operator()(T block_aggregate)`. The functor will be invoked by the first warp of threads in the block, however only the return value from *lane*0 is applied as the block-wide prefix. Can be stateful.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+
+**[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
+
+
+**Parameters**
+
+
+Calling thread's input items
+
+
+
+Calling thread's output items (may be aliased to `input`)
+
+
+
+*warp*<sub>0</sub> only call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+
+
+
+
+
+---
+
+## Exclusive prefix scan operations
+
+### ExclusiveScan inline
+
+
+
+
+Computes an exclusive block-wide prefix scan using the specified binary `scan_op` functor. Each thread contributes one input element.
+
+
+```cpp showLineNumbers={false}
+template <typename ScanOp>
+void cub::BlockScan::ExclusiveScan(
+ T input,
+ T &output,
+ T initial_value,
+ ScanOp scan_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Assumes threads are in row-major order.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item (may be aliased to `input`)
+
+
+
+Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*<sub>0</sub>)
+
+
+
+Binary scan functor
+
+
+
+
+
+Computes an exclusive block-wide prefix scan using the specified binary `scan_op` functor. Each thread contributes one input element. Also provides every thread with the block-wide `block_aggregate` of all inputs.
+
+
+```cpp showLineNumbers={false}
+template <typename ScanOp>
+void cub::BlockScan::ExclusiveScan(
+ T input,
+ T &output,
+ T initial_value,
+ ScanOp scan_op,
+ T &block_aggregate
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Assumes threads are in row-major order.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item (may be aliased to `input`)
+
+
+
+Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*<sub>0</sub>). It is not taken into account for `block_aggregate`.
+
+
+
+Binary scan functor
+
+
+
+Block-wide aggregate reduction of input items
+
+
+
+
+
+Computes an exclusive block-wide prefix scan using the specified binary `scan_op` functor. Each thread contributes one input element. The call-back functor `block_prefix_callback_op` is invoked by the first warp in the block, and the value returned by *lane*0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
+
+
+```cpp showLineNumbers={false}
+template <typename ScanOp, typename BlockPrefixCallbackOp>
+void cub::BlockScan::ExclusiveScan(
+ T input,
+ T &output,
+ ScanOp scan_op,
+ BlockPrefixCallbackOp &block_prefix_callback_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The `block_prefix_callback_op` functor must implement a member function `T operator()(T block_aggregate)`. The functor will be invoked by the first warp of threads in the block, however only the return value from *lane*0 is applied as the block-wide prefix. Can be stateful.
+Supports non-commutative scan operators.
+Assumes threads are in row-major order.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item (may be aliased to `input`)
+
+
+
+Binary scan functor
+
+
+
+*warp*<sub>0</sub> only call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+
+
+**Example**
+
+The code snippet below illustrates a single thread block that progressively computes an exclusive prefix max scan over multiple "tiles" of input using a prefix functor to maintain a running total between block-wide scans. Each tile consists of 128 integer items that are partitioned across 128 threads.
+
+Suppose the input `d_data` is `0, -1, 2, -3, 4, -5, ...`. The corresponding output for the first segment will be `INT_MIN, 0, 0, 2, ..., 124, 126`. The output for the second segment will be `126, 128, 128, 130, ..., 252, 254`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
+
+// A stateful callback functor that maintains a running prefix to be applied
+// during consecutive scan operations.
+struct BlockPrefixCallbackOp
+{
+ // Running prefix
+ int running_total;
+
+ // Constructor
+ __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+
+ // Callback operator to be entered by the first warp of threads in the block.
+ // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+ __device__ int operator()(int block_aggregate)
+ {
+ int old_prefix = running_total;
+ running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+ return old_prefix;
+ }
+};
+
+__global__ void ExampleKernel(int *d_data, int num_items, ...)
+{
+ // Specialize BlockScan for a 1D block of 128 threads
+ using BlockScan = cub::BlockScan<int, 128>;
+
+ // Allocate shared memory for BlockScan
+ __shared__ typename BlockScan::TempStorage temp_storage;
+
+ // Initialize running total
+ BlockPrefixCallbackOp prefix_op(INT_MIN);
+
+ // Have the block iterate over segments of items
+ for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+ {
+ // Load a segment of consecutive items that are blocked across threads
+ int thread_data = d_data[block_offset + threadIdx.x];
+
+ // Collectively compute the block-wide exclusive prefix max scan
+ BlockScan(temp_storage).ExclusiveScan(
+ thread_data, thread_data, cuda::maximum<>{}, prefix_op);
+ __syncthreads();
+
+ // Store scanned items to output segment
+ d_data[block_offset + threadIdx.x] = thread_data;
+ }
+}
+```
+
+
+
+
+Computes an exclusive block-wide prefix scan using the specified binary `scan_op` functor. Each thread contributes an array of consecutive input elements.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD, typename ScanOp>
+void cub::BlockScan::ExclusiveScan(
+ T (&input)[ITEMS_PER_THREAD],
+ T (&output)[ITEMS_PER_THREAD],
+ T initial_value,
+ ScanOp scan_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+
+**[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input items
+
+
+
+Calling thread's output items (may be aliased to `input`)
+
+
+
+Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*<sub>0</sub>)
+
+
+
+Binary scan functor
+
+
+
+
+
+Computes an exclusive block-wide prefix scan using the specified binary `scan_op` functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide `block_aggregate` of all inputs.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD, typename ScanOp>
+void cub::BlockScan::ExclusiveScan(
+ T (&input)[ITEMS_PER_THREAD],
+ T (&output)[ITEMS_PER_THREAD],
+ T initial_value,
+ ScanOp scan_op,
+ T &block_aggregate
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+
+**[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input items
+
+
+
+Calling thread's output items (may be aliased to `input`)
+
+
+
+Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*<sub>0</sub>). It is not taken into account for `block_aggregate`.
+
+
+
+Binary scan functor
+
+
+
+Block-wide aggregate reduction of input items
+
+
+**Example**
+
+The code snippet below illustrates an exclusive prefix max scan of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
+
+Suppose the set of input `thread_data` across the block of threads is `{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }`. The corresponding output `thread_data` in those threads will be `{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }`. Furthermore the value `510` will be stored in `block_aggregate` for all threads.
+
+**Note:** `initial_value` is not applied to the block-wide aggregate.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockScan for a 1D block of 128 threads of type int
+ using BlockScan = cub::BlockScan<int, 128>;
+
+ // Allocate shared memory for BlockScan
+ __shared__ typename BlockScan::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Collectively compute the block-wide exclusive prefix max scan
+ int block_aggregate;
+ BlockScan(temp_storage).ExclusiveScan(
+ thread_data, thread_data, INT_MIN, cuda::maximum<>{}, block_aggregate);
+```
+
+
+
+
+Computes an exclusive block-wide prefix scan using the specified binary `scan_op` functor. Each thread contributes an array of consecutive input elements. The call-back functor `block_prefix_callback_op` is invoked by the first warp in the block, and the value returned by *lane*0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
+void cub::BlockScan::ExclusiveScan(
+ T (&input)[ITEMS_PER_THREAD],
+ T (&output)[ITEMS_PER_THREAD],
+ ScanOp scan_op,
+ BlockPrefixCallbackOp &block_prefix_callback_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The `block_prefix_callback_op` functor must implement a member function `T operator()(T block_aggregate)`. The functor will be invoked by the first warp of threads in the block, however only the return value from *lane*0 is applied as the block-wide prefix. Can be stateful.
+Supports non-commutative scan operators.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+
+**[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
+
+
+**Parameters**
+
+
+Calling thread's input items
+
+
+
+Calling thread's output items (may be aliased to `input`)
+
+
+
+Binary scan functor
+
+
+
+*warp*<sub>0</sub> only call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+
+
+
+
+
+---
+
+## Inclusive prefix sum operations
+
+### InclusiveSum inline
+
+
+
+
+Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockScan::InclusiveSum(
+ T input,
+ T &output
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Assumes threads are in row-major order.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item (may be aliased to `input`)
+
+
+
+
+
+Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide `block_aggregate` of all inputs.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockScan::InclusiveSum(
+ T input,
+ T &output,
+ T &block_aggregate
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Assumes threads are in row-major order.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item (may be aliased to `input`)
+
+
+
+Block-wide aggregate reduction of input items
+
+
+
+
+
+Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor `block_prefix_callback_op` is invoked by the first warp in the block, and the value returned by *lane*0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
+
+
+```cpp showLineNumbers={false}
+template <typename BlockPrefixCallbackOp>
+void cub::BlockScan::InclusiveSum(
+ T input,
+ T &output,
+ BlockPrefixCallbackOp &block_prefix_callback_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The `block_prefix_callback_op` functor must implement a member function `T operator()(T block_aggregate)`. The functor will be invoked by the first warp of threads in the block, however only the return value from *lane*0 is applied as the block-wide prefix. Can be stateful.
+Assumes threads are in row-major order.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item (may be aliased to `input`)
+
+
+
+*warp*<sub>0</sub> only call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+
+
+**Example**
+
+The code snippet below illustrates a single thread block that progressively computes an inclusive prefix sum over multiple "tiles" of input using a prefix functor to maintain a running total between block-wide scans. Each tile consists of 128 integer items that are partitioned across 128 threads.
+
+Suppose the input `d_data` is `1, 1, 1, 1, 1, 1, 1, 1, ...`. The corresponding output for the first segment will be `1, 2, ..., 128`. The output for the second segment will be `129, 130, ..., 256`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
+
+// A stateful callback functor that maintains a running prefix to be applied
+// during consecutive scan operations.
+struct BlockPrefixCallbackOp
+{
+ // Running prefix
+ int running_total;
+
+ // Constructor
+ __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+
+ // Callback operator to be entered by the first warp of threads in the block.
+ // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+ __device__ int operator()(int block_aggregate)
+ {
+ int old_prefix = running_total;
+ running_total += block_aggregate;
+ return old_prefix;
+ }
+};
+
+__global__ void ExampleKernel(int *d_data, int num_items, ...)
+{
+ // Specialize BlockScan for a 1D block of 128 threads
+ using BlockScan = cub::BlockScan<int, 128>;
+
+ // Allocate shared memory for BlockScan
+ __shared__ typename BlockScan::TempStorage temp_storage;
+
+ // Initialize running total
+ BlockPrefixCallbackOp prefix_op(0);
+
+ // Have the block iterate over segments of items
+ for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+ {
+ // Load a segment of consecutive items that are blocked across threads
+ int thread_data = d_data[block_offset + threadIdx.x];
+
+ // Collectively compute the block-wide inclusive prefix sum
+ BlockScan(temp_storage).InclusiveSum(
+ thread_data, thread_data, prefix_op);
+ __syncthreads();
+
+ // Store scanned items to output segment
+ d_data[block_offset + threadIdx.x] = thread_data;
+ }
+}
+```
+
+
+
+
+Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD>
+void cub::BlockScan::InclusiveSum(
+ T (&input)[ITEMS_PER_THREAD],
+ T (&output)[ITEMS_PER_THREAD]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+**Parameters**
+
+
+Calling thread's input items
+
+
+
+Calling thread's output items (may be aliased to `input`)
+
+
+
+
+
+Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide `block_aggregate` of all inputs.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD>
+void cub::BlockScan::InclusiveSum(
+ T (&input)[ITEMS_PER_THREAD],
+ T (&output)[ITEMS_PER_THREAD],
+ T &block_aggregate
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+**Parameters**
+
+
+Calling thread's input items
+
+
+
+Calling thread's output items (may be aliased to `input`)
+
+
+
+Block-wide aggregate reduction of input items
+
+
+
+
+
+Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor `block_prefix_callback_op` is invoked by the first warp in the block, and the value returned by *lane*0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
+void cub::BlockScan::InclusiveSum(
+ T (&input)[ITEMS_PER_THREAD],
+ T (&output)[ITEMS_PER_THREAD],
+ BlockPrefixCallbackOp &block_prefix_callback_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The `block_prefix_callback_op` functor must implement a member function `T operator()(T block_aggregate)`. The functor will be invoked by the first warp of threads in the block, however only the return value from *lane*0 is applied as the block-wide prefix. Can be stateful.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+
+**[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
+
+
+**Parameters**
+
+
+Calling thread's input items
+
+
+
+Calling thread's output items (may be aliased to `input`)
+
+
+
+*warp*<sub>0</sub> only call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+
+
+
+
+
+---
+
+## Inclusive prefix scan operations
+
+### InclusiveScan inline
+
+
+
+
+Computes an inclusive block-wide prefix scan using the specified binary `scan_op` functor. Each thread contributes one input element.
+
+
+```cpp showLineNumbers={false}
+template <typename ScanOp>
+void cub::BlockScan::InclusiveScan(
+ T input,
+ T &output,
+ ScanOp scan_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Assumes threads are in row-major order.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item (may be aliased to `input`)
+
+
+
+Binary scan functor
+
+
+
+
+
+Computes an inclusive block-wide prefix scan using the specified binary `scan_op` functor. Each thread contributes one input element. Also provides every thread with the block-wide `block_aggregate` of all inputs.
+
+
+```cpp showLineNumbers={false}
+template <typename ScanOp>
+void cub::BlockScan::InclusiveScan(
+ T input,
+ T &output,
+ ScanOp scan_op,
+ T &block_aggregate
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Assumes threads are in row-major order.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item (may be aliased to `input`)
+
+
+
+Binary scan functor
+
+
+
+Block-wide aggregate reduction of input items
+
+
+**Example**
+
+The code snippet below illustrates an inclusive prefix max scan of 128 integer items that are partitioned across 128 threads.
+
+Suppose the set of input `thread_data` across the block of threads is `0, -1, 2, -3, ..., 126, -127`. The corresponding output `thread_data` in those threads will be `0, 0, 2, 2, ..., 126, 126`. Furthermore the value `126` will be stored in `block_aggregate` for all threads.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockScan for a 1D block of 128 threads of type int
+ using BlockScan = cub::BlockScan<int, 128>;
+
+ // Allocate shared memory for BlockScan
+ __shared__ typename BlockScan::TempStorage temp_storage;
+
+ // Obtain input item for each thread
+ int thread_data;
+ ...
+
+ // Collectively compute the block-wide inclusive prefix max scan
+ int block_aggregate;
+ BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}, block_aggregate);
+```
+
+
+
+
+Computes an inclusive block-wide prefix scan using the specified binary `scan_op` functor. Each thread contributes one input element. The call-back functor `block_prefix_callback_op` is invoked by the first warp in the block, and the value returned by *lane*0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
+
+
+```cpp showLineNumbers={false}
+template <typename ScanOp, typename BlockPrefixCallbackOp>
+void cub::BlockScan::InclusiveScan(
+ T input,
+ T &output,
+ ScanOp scan_op,
+ BlockPrefixCallbackOp &block_prefix_callback_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The `block_prefix_callback_op` functor must implement a member function `T operator()(T block_aggregate)`. The functor will be invoked by the first warp of threads in the block, however only the return value from *lane*0 is applied as the block-wide prefix. Can be stateful.
+Supports non-commutative scan operators.
+Assumes threads are in row-major order.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item (may be aliased to `input`)
+
+
+
+Binary scan functor
+
+
+
+*warp*<sub>0</sub> only call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+
+
+
+
+
+Computes an inclusive block-wide prefix scan using the specified binary `scan_op` functor. Each thread contributes an array of consecutive input elements.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD, typename ScanOp>
+void cub::BlockScan::InclusiveScan(
+ T (&input)[ITEMS_PER_THREAD],
+ T (&output)[ITEMS_PER_THREAD],
+ ScanOp scan_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+
+**[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input items
+
+
+
+Calling thread's output items (may be aliased to `input`)
+
+
+
+Binary scan functor
+
+
+
+
+
+Computes an inclusive block-wide prefix scan using the specified binary `scan_op` functor. Each thread contributes an array of consecutive input elements.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD, typename ScanOp>
+void cub::BlockScan::InclusiveScan(
+ T (&input)[ITEMS_PER_THREAD],
+ T (&output)[ITEMS_PER_THREAD],
+ T initial_value,
+ ScanOp scan_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+
+**[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input items
+
+
+
+Calling thread's output items (may be aliased to `input`)
+
+
+
+Initial value to seed the inclusive scan (uniform across block)
+
+
+
+Binary scan functor
+
+
+
+
+
+Computes an inclusive block-wide prefix scan using the specified binary `scan_op` functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide `block_aggregate` of all inputs.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD, typename ScanOp>
+void cub::BlockScan::InclusiveScan(
+ T (&input)[ITEMS_PER_THREAD],
+ T (&output)[ITEMS_PER_THREAD],
+ ScanOp scan_op,
+ T &block_aggregate
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+
+**[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input items
+
+
+
+Calling thread's output items (may be aliased to `input`)
+
+
+
+Binary scan functor
+
+
+
+Block-wide aggregate reduction of input items
+
+
+**Example**
+
+The code snippet below illustrates an inclusive prefix max scan of 512 integer items that are partitioned in a [blocked arrangement](../index.html#sec5sec3) across 128 threads where each thread owns 4 consecutive items.
+
+Suppose the set of input `thread_data` across the block of threads is `{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }`. The corresponding output `thread_data` in those threads will be `{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }`. Furthermore the value `510` will be stored in `block_aggregate` for all threads.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize BlockScan for a 1D block of 128 threads of type int
+ using BlockScan = cub::BlockScan<int, 128>;
+
+ // Allocate shared memory for BlockScan
+ __shared__ typename BlockScan::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Collectively compute the block-wide inclusive prefix max scan
+ int block_aggregate;
+ BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}, block_aggregate);
+```
+
+
+
+
+Computes an inclusive block-wide prefix scan using the specified binary `scan_op` functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide `block_aggregate` of all inputs.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD, typename ScanOp>
+void cub::BlockScan::InclusiveScan(
+ T (&input)[ITEMS_PER_THREAD],
+ T (&output)[ITEMS_PER_THREAD],
+ T initial_value,
+ ScanOp scan_op,
+ T &block_aggregate
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+
+**[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input items
+
+
+
+Calling thread's output items (may be aliased to `input`)
+
+
+
+Initial value to seed the inclusive scan (uniform across block). It is not taken into account for `block_aggregate`.
+
+
+
+Binary scan functor
+
+
+
+Block-wide aggregate reduction of input items
+
+
+
+
+
+Computes an inclusive block-wide prefix scan using the specified binary `scan_op` functor. Each thread contributes an array of consecutive input elements. The call-back functor `block_prefix_callback_op` is invoked by the first warp in the block, and the value returned by *lane*<sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
+void cub::BlockScan::InclusiveScan(
+ T (&input)[ITEMS_PER_THREAD],
+ T (&output)[ITEMS_PER_THREAD],
+ ScanOp scan_op,
+ BlockPrefixCallbackOp &block_prefix_callback_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The `block_prefix_callback_op` functor must implement a member function `T operator()(T block_aggregate)`. The functor will be invoked by the first warp of threads in the block, however only the return value from *lane*<sub>0</sub> is applied as the block-wide prefix. Can be stateful.
+Supports non-commutative scan operators.
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** The number of consecutive items partitioned onto each thread.
+
+
+
+**[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
+
+
+**Parameters**
+
+
+Calling thread's input items
+
+
+
+Calling thread's output items (may be aliased to `input`)
+
+
+
+Binary scan functor
+
+
+
+*warp*<sub>0</sub> only call-back functor for specifying a block-wide prefix to be applied to
+the logical input sequence.
+
+
+
+
+**Example**
+
+The code snippet below illustrates a single thread block that progressively computes an inclusive prefix max scan over multiple "tiles" of input using a prefix functor to maintain a running total between block-wide scans. Each tile consists of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
+
+Suppose the input `d_data` is `0, -1, 2, -3, 4, -5, ...`. The corresponding output for the first segment will be `0, 0, 2, 2, 4, 4, ..., 510, 510`. The output for the second segment will be `512, 512, 514, 514, 516, 516, ..., 1022, 1022`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
+
+// A stateful callback functor that maintains a running prefix to be applied
+// during consecutive scan operations.
+struct BlockPrefixCallbackOp
+{
+ // Running prefix
+ int running_total;
+
+ // Constructor
+ __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+
+ // Callback operator to be entered by the first warp of threads in the block.
+ // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+ __device__ int operator()(int block_aggregate)
+ {
+ int old_prefix = running_total;
+ running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+ return old_prefix;
+ }
+};
+
+__global__ void ExampleKernel(int *d_data, int num_items, ...)
+{
+ // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+ using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_TRANSPOSE>;
+ using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_TRANSPOSE>;
+ using BlockScan = cub::BlockScan<int, 128>;
+
+ // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+ __shared__ union {
+ typename BlockLoad::TempStorage load;
+ typename BlockScan::TempStorage scan;
+ typename BlockStore::TempStorage store;
+ } temp_storage;
+
+ // Initialize running total
+ BlockPrefixCallbackOp prefix_op(0);
+
+ // Have the block iterate over segments of items
+ for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+ {
+ // Load a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+ __syncthreads();
+
+ // Collectively compute the block-wide inclusive prefix max scan
+ BlockScan(temp_storage.scan).InclusiveScan(
+ thread_data, thread_data, cuda::maximum<>{}, prefix_op);
+ __syncthreads();
+
+ // Store scanned items to output segment
+ BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+ __syncthreads();
+ }
+```
+
+
+
+
+---
+
+## Utility methods
+
+### PrivateStorage inline
+
+Internal storage allocator.
+
+
+```cpp showLineNumbers={false}
+_TempStorage & cub::BlockScan::PrivateStorage()
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `WarpScans` | `detail::BlockScanWarpScans< T, BlockDimX, BlockDimY, BlockDimZ >` | |
+| `Raking` | `detail::BlockScanRaking< T, BlockDimX, BlockDimY, BlockDimZ,(SAFE_ALGORITHM==BLOCK_SCAN_RAKING_MEMOIZE)>` | |
+| `InternalBlockScan` | `::cuda::std::_If< SAFE_ALGORITHM==BLOCK_SCAN_WARP_SCANS, WarpScans, Raking >` | Define the delegate type for the desired algorithm. |
+| `_TempStorage` | `typename InternalBlockScan::TempStorage` | Shared memory storage layout type for `BlockScan`. |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | The thread block size in threads. |
+| `SAFE_ALGORITHM` static constexpr | `BlockScanAlgorithm` | Ensure the template parameterization meets the requirements of the specified algorithm. |
+| `temp_storage` | `_TempStorage &` | Shared storage reference. |
+| `linear_tid` | `unsigned int` | Linear thread-id. |
+
+---
+
+## Inner classes
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockScan::TempStorage
+```
+
+
+The operations exposed by `BlockScan` require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the `__shared__` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or `union`'d with other storage allocation types to facilitate memory reuse.
+
+**Inherits from:** `Uninitialized< _TempStorage >` (public)
diff --git a/fern/cudapages/cub/cub/cub/BlockScanRunningPrefixOp.mdx b/fern/cudapages/cub/cub/cub/BlockScanRunningPrefixOp.mdx
new file mode 100644
index 0000000..111eb37
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockScanRunningPrefixOp.mdx
@@ -0,0 +1,91 @@
+---
+title: cub::BlockScanRunningPrefixOp
+description: "Stateful callback operator type for supplying [BlockScan](/library/api/cub::_block_scan) prefixes."
+---
+
+Stateful callback operator type for supplying [BlockScan](/library/api/cub::_block_scan) prefixes.
+
+Maintains a running prefix that can be applied to consecutive [BlockScan](/library/api/cub::_block_scan) operations.
+
+
+
+
+
+[BlockScan](/library/api/cub::_block_scan) value type
+
+
+
+Wrapped scan operator type
+
+
+
+
+
+---
+
+## Constructors
+
+### BlockScanRunningPrefixOp inline
+
+
+
+
+Constructor.
+
+
+```cpp showLineNumbers={false}
+cub::BlockScanRunningPrefixOp::BlockScanRunningPrefixOp(
+ ScanOpT op
+)
+```
+
+
+
+
+
+Constructor.
+
+
+```cpp showLineNumbers={false}
+cub::BlockScanRunningPrefixOp::BlockScanRunningPrefixOp(
+ T starting_prefix,
+ ScanOpT op
+)
+```
+
+
+
+
+
+---
+
+## Methods
+
+### operator() inline
+
+Prefix callback operator.
+
+Returns the block-wide running_total in thread-0.
+
+
+```cpp showLineNumbers={false}
+T cub::BlockScanRunningPrefixOp::operator()(
+ const T &block_aggregate
+)
+```
+
+
+**Parameters**
+
+
+The aggregate sum of the [BlockScan](/library/api/cub::_block_scan) inputs
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `op` | `ScanOpT` | Wrapped scan operator. |
+| `running_total` | `T` | Running block-wide prefix. |
diff --git a/fern/cudapages/cub/cub/cub/BlockShuffle.mdx b/fern/cudapages/cub/cub/cub/BlockShuffle.mdx
new file mode 100644
index 0000000..3942086
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockShuffle.mdx
@@ -0,0 +1,344 @@
+---
+title: cub::BlockShuffle
+description: ""
+---
+
+The BlockShuffle class provides collective methods for shuffling data partitioned across a CUDA thread block.
+
+
+
+
+
+The data type to be exchanged.
+
+
+
+The thread block length in threads along the X dimension
+
+
+
+**[optional]** The thread block length in threads along the Y dimension (default: 1)
+
+
+
+**[optional]** The thread block length in threads along the Z dimension (default: 1)
+
+
+
+
+
+---
+
+## Collective constructors
+
+### BlockShuffle inline
+
+
+
+
+Collective constructor using a private static allocation of shared memory as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockShuffle::BlockShuffle()
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+Collective constructor using the specified memory allocation as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockShuffle::BlockShuffle(
+ TempStorage &temp_storage
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Parameters**
+
+
+Reference to memory allocation having layout type [TempStorage](/library/api/cub::BlockShuffle::TempStorage)
+
+
+
+
+
+---
+
+## Shuffle movement
+
+### Offset inline
+
+Each *thread*<sub>i</sub> obtains the `input` provided by *thread*<sub>i + distance</sub>. The offset `distance` may be negative.
+
+
+```cpp showLineNumbers={false}
+void cub::BlockShuffle::Offset(
+ T input,
+ T &output,
+ int distance = 1
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+The input item from the calling thread (*thread*<sub>i</sub>)
+
+
+
+
+
+The `input` item from the successor (or predecessor) thread
+*thread*<sub>i + distance</sub> (may be aliased to `input`).
+This value is only updated for *thread*<sub>i</sub> when
+`0 <= (i + distance) < BLOCK_THREADS - 1`
+
+
+
+
+
+Offset distance (may be negative)
+
+
+### Up inline
+
+
+
+
+The thread block rotates its blocked arrangement of `input` items, shifting it up by one item.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD>
+void cub::BlockShuffle::Up(
+ T (&input)[ITEMS_PER_THREAD],
+ T (&prev)[ITEMS_PER_THREAD]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+The calling thread's input items
+
+
+
+The corresponding predecessor items (may be aliased to `input`).
+The item `prev[0]` is not updated for *thread*<sub>0</sub>.
+
+
+
+
+
+
+
+The thread block rotates its blocked arrangement of `input` items, shifting it up by one item. All threads receive the `input` provided by *thread*<sub>BLOCK_THREADS - 1</sub>.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD>
+void cub::BlockShuffle::Up(
+ T (&input)[ITEMS_PER_THREAD],
+ T (&prev)[ITEMS_PER_THREAD],
+ T &block_suffix
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+The calling thread's input items
+
+
+
+The corresponding predecessor items (may be aliased to `input`).
+The item `prev[0]` is not updated for *thread*<sub>0</sub>.
+
+
+
+
+
+The item `input[ITEMS_PER_THREAD - 1]` from *thread*<sub>BLOCK_THREADS - 1</sub>, provided to all threads
+
+
+
+
+
+
+
+### Down inline
+
+
+
+
+The thread block rotates its blocked arrangement of `input` items, shifting it down by one item.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD>
+void cub::BlockShuffle::Down(
+ T (&input)[ITEMS_PER_THREAD],
+ T (&prev)[ITEMS_PER_THREAD]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+The calling thread's input items
+
+
+
+The corresponding predecessor items (may be aliased to `input`).
+The value `prev[0]` is not updated for *thread*<sub>BLOCK_THREADS - 1</sub>.
+
+
+
+
+
+
+
+The thread block rotates its blocked arrangement of input items, shifting it down by one item. All threads receive `input[0]` provided by *thread*<sub>0</sub>.
+
+
+```cpp showLineNumbers={false}
+template <int ITEMS_PER_THREAD>
+void cub::BlockShuffle::Down(
+ T (&input)[ITEMS_PER_THREAD],
+ T (&prev)[ITEMS_PER_THREAD],
+ T &block_prefix
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Data is in a blocked arrangement across threads.
+Performance is sensitive to the degree of data movement across the block.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+The calling thread's input items
+
+
+
+The corresponding predecessor items (may be aliased to `input`).
+The value `prev[0]` is not updated for *thread*<sub>BLOCK_THREADS - 1</sub>.
+
+
+
+
+
+The item `input[0]` from *thread*<sub>0</sub>, provided to all threads
+
+
+
+
+
+
+
+---
+
+## Utility methods
+
+### PrivateStorage inline
+
+Internal storage allocator.
+
+
+```cpp showLineNumbers={false}
+_TempStorage & cub::BlockShuffle::PrivateStorage()
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `_TempStorage` | `T[BLOCK_THREADS]` | Shared memory storage layout type (last element from each thread's input). |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | |
+| `LOG_WARP_THREADS` static constexpr | `int` | |
+| `WARP_THREADS` static constexpr | `int` | |
+| `WARPS` static constexpr | `int` | |
+| `temp_storage` | `_TempStorage &` | Shared storage reference. |
+| `linear_tid` | `unsigned int` | Linear thread-id. |
+
+---
+
+## Inner classes
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockShuffle::TempStorage
+```
+
+
+The operations exposed by `BlockShuffle` require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the `__shared__` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or `union`'d with other storage allocation types to facilitate memory reuse.
+
+**Inherits from:** `Uninitialized< _TempStorage >` (public)
diff --git a/fern/cudapages/cub/cub/cub/BlockStore.mdx b/fern/cudapages/cub/cub/cub/BlockStore.mdx
new file mode 100644
index 0000000..aa6978d
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/BlockStore.mdx
@@ -0,0 +1,388 @@
+---
+title: cub::BlockStore
+description: ""
+---
+
+The BlockStore class provides collective data movement methods for writing a blocked arrangement of items partitioned across a CUDA thread block to a linear segment of memory.
+
+## Example
+
+The code snippet below illustrates the storing of a "blocked" arrangement of 512 integers across 128 threads (where each thread owns 4 consecutive items) into a linear segment of memory. The store is specialized for `BLOCK_STORE_WARP_TRANSPOSE`, meaning items are locally reordered among threads so that memory references will be efficiently coalesced using a warp-striped access pattern.
+
+Suppose the set of input `thread_data` across the block of threads is `{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }`. The output `d_data` will be `0, 1, 2, 3, 4, 5, ...`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_store.cuh>
+
+__global__ void ExampleKernel(int *d_data, ...)
+{
+ // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+ using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE>;
+
+ // Allocate shared memory for BlockStore
+ __shared__ typename BlockStore::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Store items to linear memory
+ BlockStore(temp_storage).Store(d_data, thread_data);
+}
+```
+
+
+
+
+
+The type of data to be written.
+
+
+
+The thread block length in threads along the X dimension
+
+
+
+The number of consecutive items partitioned onto each thread.
+
+
+
+
+
+
+**[optional]** The thread block length in threads along the Y dimension (default: 1)
+
+
+
+**[optional]** The thread block length in threads along the Z dimension (default: 1)
+
+
+
+
+
+---
+
+## Collective constructors
+
+### BlockStore inline
+
+
+
+
+Collective constructor using a private static allocation of shared memory as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockStore::BlockStore()
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+Collective constructor using the specified memory allocation as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::BlockStore::BlockStore(
+ TempStorage &temp_storage
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Parameters**
+
+
+Reference to memory allocation having layout type [TempStorage](/library/api/cub::BlockStore::TempStorage)
+
+
+
+
+
+---
+
+## Data movement
+
+### Store inline
+
+
+
+
+Store items into a linear segment of memory
+
+
+```cpp showLineNumbers={false}
+template <typename OutputIteratorT>
+void cub::BlockStore::Store(
+ OutputIteratorT block_itr,
+ T (&items)[ItemsPerThread]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Data is in a blocked arrangement across threads.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+The thread block's base output iterator for storing to
+
+
+
+Data to store
+
+
+**Example**
+
+The code snippet below illustrates the storing of a "blocked" arrangement of 512 integers across 128 threads (where each thread owns 4 consecutive items) into a linear segment of memory. The store is specialized for `BLOCK_STORE_WARP_TRANSPOSE`, meaning items are locally reordered among threads so that memory references will be efficiently coalesced using a warp-striped access pattern.
+
+Suppose the set of input `thread_data` across the block of threads is `{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }`. The output `d_data` will be `0, 1, 2, 3, 4, 5, ...`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_store.cuh>
+
+__global__ void ExampleKernel(int *d_data, ...)
+{
+ // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+ using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE>;
+
+ // Allocate shared memory for BlockStore
+ __shared__ typename BlockStore::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Store items to linear memory
+ BlockStore(temp_storage).Store(d_data, thread_data);
+}
+```
+
+
+
+
+Store items into a linear segment of memory, guarded by range.
+
+
+```cpp showLineNumbers={false}
+template <typename OutputIteratorT>
+void cub::BlockStore::Store(
+ OutputIteratorT block_itr,
+ T (&items)[ItemsPerThread],
+ int valid_items
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Data is in a blocked arrangement across threads.
+
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+The thread block's base output iterator for storing to
+
+
+
+Data to store
+
+
+
+Number of valid items to write
+
+
+**Example**
+
+The code snippet below illustrates the guarded storing of a "blocked" arrangement of 512 integers across 128 threads (where each thread owns 4 consecutive items) into a linear segment of memory. The store is specialized for `BLOCK_STORE_WARP_TRANSPOSE`, meaning items are locally reordered among threads so that memory references will be efficiently coalesced using a warp-striped access pattern.
+
+Suppose the set of input `thread_data` across the block of threads is `{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }` and `valid_items` is `5`. The output `d_data` will be `0, 1, 2, 3, 4, ?, ?, ?, ...`, with only the first two threads being unmasked to store portions of valid data.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/block/block_store.cuh>
+
+__global__ void ExampleKernel(int *d_data, int valid_items, ...)
+{
+ // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+ using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE>;
+
+ // Allocate shared memory for BlockStore
+ __shared__ typename BlockStore::TempStorage temp_storage;
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Store items to linear memory
+ BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
+}
+```
+
+
+
+
+---
+
+## Utility methods
+
+### PrivateStorage inline
+
+Internal storage allocator.
+
+
+```cpp showLineNumbers={false}
+_TempStorage & cub::BlockStore::PrivateStorage()
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `InternalStore` | `StoreInternal< Algorithm, 0 >` | Internal store implementation to use. |
+| `_TempStorage` | `typename InternalStore::TempStorage` | Shared memory storage layout type. |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `BLOCK_THREADS` static constexpr | `int` | The thread block size in threads. |
+| `temp_storage` | `_TempStorage &` | Thread reference to shared storage. |
+| `linear_tid` | `int` | Linear thread-id. |
+
+---
+
+## Inner classes
+
+### StoreInternal
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockStore::StoreInternal
+```
+
+
+Store helper.
+
+### StoreInternal< BLOCK_STORE_DIRECT, DUMMY >
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockStore::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `linear_tid` | `int` | Linear thread-id. |
+
+### StoreInternal< BLOCK_STORE_STRIPED, DUMMY >
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockStore::StoreInternal< BLOCK_STORE_STRIPED, DUMMY >
+```
+
+
+BLOCK_STORE_STRIPED specialization of store helper.
+
+| Name | Type | Description |
+|---|---|---|
+| `linear_tid` | `int` | Linear thread-id. |
+
+### StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockStore::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >
+```
+
+
+BLOCK_STORE_VECTORIZE specialization of store helper.
+
+| Name | Type | Description |
+|---|---|---|
+| `linear_tid` | `int` | Linear thread-id. |
+
+### StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockStore::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >
+```
+
+
+BLOCK_STORE_TRANSPOSE specialization of store helper.
+
+| Name | Type | Description |
+|---|---|---|
+| `temp_storage` | `_TempStorage &` | Thread reference to shared storage. |
+| `linear_tid` | `int` | Linear thread-id. |
+
+### StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockStore::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >
+```
+
+
+BLOCK_STORE_WARP_TRANSPOSE specialization of store helper.
+
+| Name | Type | Description |
+|---|---|---|
+| `WARP_THREADS` static constexpr | `int` | |
+| `temp_storage` | `_TempStorage &` | Thread reference to shared storage. |
+| `linear_tid` | `int` | Linear thread-id. |
+
+### StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockStore::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >
+```
+
+
+BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper.
+
+| Name | Type | Description |
+|---|---|---|
+| `WARP_THREADS` static constexpr | `int` | |
+| `temp_storage` | `_TempStorage &` | Thread reference to shared storage. |
+| `linear_tid` | `int` | Linear thread-id. |
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::BlockStore::TempStorage
+```
+
+
+The operations exposed by `BlockStore` require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the `__shared__` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or `union`'d with other storage allocation types to facilitate memory reuse.
+
+**Inherits from:** `Uninitialized< _TempStorage >` (public)
diff --git a/fern/cudapages/cub/cub/cub/CacheModifiedInputIterator.mdx b/fern/cudapages/cub/cub/cub/CacheModifiedInputIterator.mdx
new file mode 100644
index 0000000..fdda0b9
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/CacheModifiedInputIterator.mdx
@@ -0,0 +1,265 @@
+---
+title: cub::CacheModifiedInputIterator
+description: "A random-access input wrapper for dereferencing array values using a PTX cache load modifier."
+---
+
+A random-access input wrapper for dereferencing array values using a PTX cache load modifier.
+
+**Overview**
+
+- `CacheModifiedInputIterator` is a random-access input iterator that wraps a native device pointer of type `ValueType*`. `ValueType` references are made by reading `ValueType` values through loads modified by `MODIFIER`.
+- Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.).
+- Can be constructed, manipulated, and exchanged within and between host and device functions, but can only be dereferenced within device functions.
+- Compatible with Thrust API v1.7 or newer.
+
+**Snippet**
+
+The code snippet below illustrates the use of `CacheModifiedInputIterator` to dereference a device array of double using the "ldg" PTX load modifier (i.e., load values through texture cache).
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/iterator/cache_modified_input_iterator.cuh>
+
+// Declare, allocate, and initialize a device array
+double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
+
+// Create an iterator wrapper
+cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> itr(d_in);
+
+// Within device code:
+printf("%f\n", itr[0]); // 8.0
+printf("%f\n", itr[1]); // 6.0
+printf("%f\n", itr[6]); // 9.0
+```
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+
+
+
+
+The value type of this iterator
+
+
+
+The difference type of this iterator (Default: `ptrdiff_t`)
+
+
+
+
+
+---
+
+## Constructors
+
+### CacheModifiedInputIterator inline
+
+Constructor.
+
+
+```cpp showLineNumbers={false}
+template <typename QualifiedValueType>
+cub::CacheModifiedInputIterator::CacheModifiedInputIterator(
+ QualifiedValueType *ptr
+)
+```
+
+
+---
+
+## Methods
+
+### operator++ inline
+
+
+
+
+Postfix increment.
+
+
+```cpp showLineNumbers={false}
+self_type cub::CacheModifiedInputIterator::operator++(
+ int
+)
+```
+
+
+
+
+
+Prefix increment.
+
+
+```cpp showLineNumbers={false}
+self_type cub::CacheModifiedInputIterator::operator++()
+```
+
+
+
+
+
+### operator* inline const
+
+Indirection.
+
+
+```cpp showLineNumbers={false}
+reference cub::CacheModifiedInputIterator::operator*() const
+```
+
+
+### operator+ inline const
+
+Addition.
+
+
+```cpp showLineNumbers={false}
+template <typename Distance>
+self_type cub::CacheModifiedInputIterator::operator+(
+ Distance n
+) const
+```
+
+
+### operator+= inline
+
+Addition assignment.
+
+
+```cpp showLineNumbers={false}
+template <typename Distance>
+self_type & cub::CacheModifiedInputIterator::operator+=(
+ Distance n
+)
+```
+
+
+### operator- inline const
+
+
+
+
+Subtraction.
+
+
+```cpp showLineNumbers={false}
+template <typename Distance>
+self_type cub::CacheModifiedInputIterator::operator-(
+ Distance n
+) const
+```
+
+
+
+
+
+Distance.
+
+
+```cpp showLineNumbers={false}
+difference_type cub::CacheModifiedInputIterator::operator-(
+ self_type other
+) const
+```
+
+
+
+
+
+### operator-= inline
+
+Subtraction assignment.
+
+
+```cpp showLineNumbers={false}
+template <typename Distance>
+self_type & cub::CacheModifiedInputIterator::operator-=(
+ Distance n
+)
+```
+
+
+### operator[] inline const
+
+Array subscript.
+
+
+```cpp showLineNumbers={false}
+template <typename Distance>
+reference cub::CacheModifiedInputIterator::operator[](
+ Distance n
+) const
+```
+
+
+### operator-> inline
+
+Structure dereference.
+
+
+```cpp showLineNumbers={false}
+pointer cub::CacheModifiedInputIterator::operator->()
+```
+
+
+### operator== inline const
+
+Equal to.
+
+
+```cpp showLineNumbers={false}
+bool cub::CacheModifiedInputIterator::operator==(
+ const self_type &rhs
+) const
+```
+
+
+### operator!= inline const
+
+Not equal to.
+
+
+```cpp showLineNumbers={false}
+bool cub::CacheModifiedInputIterator::operator!=(
+ const self_type &rhs
+) const
+```
+
+
+### operator<< inline
+
+ostream operator
+
+
+```cpp showLineNumbers={false}
+friend ::std::ostream & cub::CacheModifiedInputIterator::operator<<(
+ ::std::ostream &os,
+ const self_type &
+)
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `self_type` | `CacheModifiedInputIterator` | My own type. |
+| `difference_type` | `OffsetT` | Type to express the result of subtracting one iterator from another. |
+| `value_type` | `ValueType` | The type of the element the iterator can point to. |
+| `pointer` | `ValueType *` | The type of a pointer to an element the iterator can point to. |
+| `reference` | `ValueType` | The type of a reference to an element the iterator can point to. |
+| `iterator_category` | `THRUST_NS_QUALIFIER::detail::iterator_facade_category_t< THRUST_NS_QUALIFIER::device_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag >` | |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `ptr` | `ValueType *` | Wrapped native pointer. |
diff --git a/fern/cudapages/cub/cub/cub/CacheModifiedOutputIterator.mdx b/fern/cudapages/cub/cub/cub/CacheModifiedOutputIterator.mdx
new file mode 100644
index 0000000..b0d3ad5
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/CacheModifiedOutputIterator.mdx
@@ -0,0 +1,279 @@
+---
+title: cub::CacheModifiedOutputIterator
+description: "A random-access output wrapper for storing array values using a PTX cache-modifier."
+---
+
+A random-access output wrapper for storing array values using a PTX cache-modifier.
+
+**Overview**
+
+- `CacheModifiedOutputIterator` is a random-access output iterator that wraps a native device pointer of type `ValueType*`. `ValueType` references are made by writing `ValueType` values through stores modified by `MODIFIER`.
+- Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB", "STORE_CG", "STORE_CS", "STORE_WT", etc.).
+- Can be constructed, manipulated, and exchanged within and between host and device functions, but can only be dereferenced within device functions.
+- Compatible with Thrust API v1.7 or newer.
+
+**Snippet**
+
+The code snippet below illustrates the use of `CacheModifiedOutputIterator` to write to a device array of doubles using the "wt" PTX store modifier (i.e., write-through to system memory).
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/iterator/cache_modified_output_iterator.cuh>
+
+// Declare, allocate, and initialize a device array
+double *d_out; // e.g., [, , , , , , ]
+
+// Create an iterator wrapper
+cub::CacheModifiedOutputIterator<cub::STORE_WT, double> itr(d_out);
+
+// Within device code:
+itr[0] = 8.0;
+itr[1] = 66.0;
+itr[55] = 24.0;
+```
+
+**Usage Considerations**
+
+- Can only be dereferenced within device code
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+
+
+
+
+The value type of this iterator
+
+
+
+The difference type of this iterator (Default: `ptrdiff_t`)
+
+
+
+
+
+---
+
+## Constructors
+
+### CacheModifiedOutputIterator inline
+
+
+```cpp showLineNumbers={false}
+template <typename QualifiedValueType>
+cub::CacheModifiedOutputIterator::CacheModifiedOutputIterator(
+ QualifiedValueType *ptr
+)
+```
+
+
+**Parameters**
+
+
+Native pointer to wrap
+
+
+---
+
+## Methods
+
+### operator++ inline
+
+
+
+
+Postfix increment.
+
+
+```cpp showLineNumbers={false}
+self_type cub::CacheModifiedOutputIterator::operator++(
+ int
+)
+```
+
+
+
+
+
+Prefix increment.
+
+
+```cpp showLineNumbers={false}
+self_type cub::CacheModifiedOutputIterator::operator++()
+```
+
+
+
+
+
+### operator* inline const
+
+Indirection.
+
+
+```cpp showLineNumbers={false}
+reference cub::CacheModifiedOutputIterator::operator*() const
+```
+
+
+### operator+ inline const
+
+Addition.
+
+
+```cpp showLineNumbers={false}
+template <typename Distance>
+self_type cub::CacheModifiedOutputIterator::operator+(
+ Distance n
+) const
+```
+
+
+### operator+= inline
+
+Addition assignment.
+
+
+```cpp showLineNumbers={false}
+template <typename Distance>
+self_type & cub::CacheModifiedOutputIterator::operator+=(
+ Distance n
+)
+```
+
+
+### operator- inline const
+
+
+
+
+Subtraction.
+
+
+```cpp showLineNumbers={false}
+template <typename Distance>
+self_type cub::CacheModifiedOutputIterator::operator-(
+ Distance n
+) const
+```
+
+
+
+
+
+Distance.
+
+
+```cpp showLineNumbers={false}
+difference_type cub::CacheModifiedOutputIterator::operator-(
+ self_type other
+) const
+```
+
+
+
+
+
+### operator-= inline
+
+Subtraction assignment.
+
+
+```cpp showLineNumbers={false}
+template <typename Distance>
+self_type & cub::CacheModifiedOutputIterator::operator-=(
+ Distance n
+)
+```
+
+
+### operator[] inline const
+
+Array subscript.
+
+
+```cpp showLineNumbers={false}
+template <typename Distance>
+reference cub::CacheModifiedOutputIterator::operator[](
+ Distance n
+) const
+```
+
+
+### operator== inline
+
+Equal to.
+
+
+```cpp showLineNumbers={false}
+bool cub::CacheModifiedOutputIterator::operator==(
+ const self_type &rhs
+)
+```
+
+
+### operator!= inline
+
+Not equal to.
+
+
+```cpp showLineNumbers={false}
+bool cub::CacheModifiedOutputIterator::operator!=(
+ const self_type &rhs
+)
+```
+
+
+### operator<< inline
+
+ostream operator
+
+
+```cpp showLineNumbers={false}
+friend ::std::ostream & cub::CacheModifiedOutputIterator::operator<<(
+ ::std::ostream &os,
+ const self_type &itr
+)
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `self_type` | `CacheModifiedOutputIterator` | My own type. |
+| `difference_type` | `OffsetT` | Type to express the result of subtracting one iterator from another. |
+| `value_type` | `void` | The type of the element the iterator can point to. |
+| `pointer` | `void` | The type of a pointer to an element the iterator can point to. |
+| `reference` | `Reference` | The type of a reference to an element the iterator can point to. |
+| `iterator_category` | `THRUST_NS_QUALIFIER::detail::iterator_facade_category_t< THRUST_NS_QUALIFIER::device_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag >` | The iterator category. |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `ptr` | `ValueType *` | |
+
+---
+
+## Inner classes
+
+### Reference
+
+
+```cpp showLineNumbers={false}
+struct cub::CacheModifiedOutputIterator::Reference
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `ptr` | `ValueType *` | |
diff --git a/fern/cudapages/cub/cub/cub/CachingDeviceAllocator.mdx b/fern/cudapages/cub/cub/cub/CachingDeviceAllocator.mdx
new file mode 100644
index 0000000..94091b9
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/CachingDeviceAllocator.mdx
@@ -0,0 +1,253 @@
+---
+title: cub::CachingDeviceAllocator
+description: "A simple caching allocator for device memory allocations."
+---
+
+A simple caching allocator for device memory allocations.
+
+**Overview**
+
+The allocator is thread-safe and stream-safe and is capable of managing cached device allocations on multiple devices. It behaves as follows:
+
+- Allocations from the allocator are associated with an `active_stream`. Once freed, the allocation becomes available immediately for reuse within the `active_stream` with which it was associated during allocation, and it becomes available for reuse within other streams when all prior work submitted to `active_stream` has completed.
+- Allocations are categorized and cached by bin size. A new allocation request of a given size will only consider cached allocations within the corresponding bin.
+- Bin limits progress geometrically in accordance with the growth factor `bin_growth` provided during construction. Unused device allocations within a larger bin cache are not reused for allocation requests that categorize to smaller bin sizes.
+- Allocation requests below ( `bin_growth` ^ `min_bin` ) are rounded up to ( `bin_growth` ^ `min_bin` ).
+- Allocations above ( `bin_growth` ^ `max_bin` ) are not rounded up to the nearest bin and are simply freed when they are deallocated instead of being returned to a bin-cache.
+- If the total storage of cached allocations on a given device will exceed `max_cached_bytes`, allocations for that device are simply freed when they are deallocated instead of being returned to their bin-cache.
+
+For example, the default-constructed `CachingDeviceAllocator` is configured with:
+- `bin_growth` = 8
+- `min_bin` = 3
+- `max_bin` = 7
+- `max_cached_bytes` = 6MB - 1B
+
+which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and sets a maximum of 6,291,455 cached bytes per device
+
+---
+
+## Constructors
+
+### CachingDeviceAllocator inline
+
+
+
+
+Constructor.
+
+
+```cpp showLineNumbers={false}
+cub::CachingDeviceAllocator::CachingDeviceAllocator(
+ unsigned int bin_growth,
+ unsigned int min_bin = 1,
+ unsigned int max_bin = INVALID_BIN,
+ size_t max_cached_bytes = INVALID_SIZE,
+ bool skip_cleanup = false
+)
+```
+
+
+**Parameters**
+
+
+Geometric growth factor for bin-sizes
+
+
+
+Minimum bin (default is bin_growth ^ 1)
+
+
+
+Maximum bin (default is no max bin)
+
+
+
+Maximum aggregate cached bytes per device (default is no limit)
+
+
+
+Whether or not to skip a call to [`FreeAllCached()`](/library/api/cub::_caching_device_allocator::FreeAllCached()) when the destructor is called (default is to deallocate)
+
+
+
+
+
+Default constructor.
+
+Configured with:
+
+- `bin_growth` = 8
+- `min_bin` = 3
+- `max_bin` = 7
+- `max_cached_bytes` = ( ( `bin_growth` ^ `max_bin` ) * 3 ) - 1 = 6,291,455 bytes
+
+which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and sets a maximum of 6,291,455 cached bytes per device
+
+
+```cpp showLineNumbers={false}
+cub::CachingDeviceAllocator::CachingDeviceAllocator(
+ bool skip_cleanup = false,
+ bool debug = false
+)
+```
+
+
+
+
+
+### Destructor
+
+### ~CachingDeviceAllocator inline virtual
+
+Destructor.
+
+
+```cpp showLineNumbers={false}
+virtual cub::CachingDeviceAllocator::~CachingDeviceAllocator()
+```
+
+
+---
+
+## Methods
+
+### SetMaxCachedBytes inline
+
+Sets the limit on the number of bytes this allocator is allowed to cache per device.
+
+Changing the ceiling of cached bytes does not cause any allocations (in-use or cached-in-reserve) to be freed. See [`FreeAllCached()`](/library/api/cub::_caching_device_allocator::FreeAllCached()).
+
+
+```cpp showLineNumbers={false}
+cudaError_t cub::CachingDeviceAllocator::SetMaxCachedBytes(
+ size_t max_cached_bytes_
+)
+```
+
+
+### DeviceAllocate inline
+
+
+
+
+Provides a suitable allocation of device memory for the given size on the specified device.
+
+Once freed, the allocation becomes available immediately for reuse within the `active_stream` with which it was associated during allocation, and it becomes available for reuse within other streams when all prior work submitted to `active_stream` has completed.
+
+
+```cpp showLineNumbers={false}
+cudaError_t cub::CachingDeviceAllocator::DeviceAllocate(
+ int device,
+ void **d_ptr,
+ size_t bytes,
+ cudaStream_t active_stream = 0
+)
+```
+
+
+**Parameters**
+
+
+Device on which to place the allocation
+
+
+
+Reference to pointer to the allocation
+
+
+
+Minimum number of bytes for the allocation
+
+
+
+The stream to be associated with this allocation
+
+
+
+
+
+Provides a suitable allocation of device memory for the given size on the current device.
+
+Once freed, the allocation becomes available immediately for reuse within the `active_stream` with which it was associated during allocation, and it becomes available for reuse within other streams when all prior work submitted to `active_stream` has completed.
+
+
+```cpp showLineNumbers={false}
+cudaError_t cub::CachingDeviceAllocator::DeviceAllocate(
+ void **d_ptr,
+ size_t bytes,
+ cudaStream_t active_stream = 0
+)
+```
+
+
+**Parameters**
+
+
+Reference to pointer to the allocation
+
+
+
+Minimum number of bytes for the allocation
+
+
+
+The stream to be associated with this allocation
+
+
+
+
+
+### DeviceFree inline
+
+
+
+
+Frees a live allocation of device memory on the specified device, returning it to the allocator.
+
+Once freed, the allocation becomes available immediately for reuse within the `active_stream` with which it was associated during allocation, and it becomes available for reuse within other streams when all prior work submitted to `active_stream` has completed.
+
+
+```cpp showLineNumbers={false}
+cudaError_t cub::CachingDeviceAllocator::DeviceFree(
+ int device,
+ void *d_ptr
+)
+```
+
+
+
+
+
+Frees a live allocation of device memory on the current device, returning it to the allocator.
+
+Once freed, the allocation becomes available immediately for reuse within the `active_stream` with which it was associated during allocation, and it becomes available for reuse within other streams when all prior work submitted to `active_stream` has completed.
+
+
+```cpp showLineNumbers={false}
+cudaError_t cub::CachingDeviceAllocator::DeviceFree(
+ void *d_ptr
+)
+```
+
+
+
+
+
+### FreeAllCached inline
+
+Frees all cached device allocations on all devices.
+
+
+```cpp showLineNumbers={false}
+cudaError_t cub::CachingDeviceAllocator::FreeAllCached()
+```
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `INVALID_BIN` static constexpr | `unsigned int` | Out-of-bounds bin. |
+| `INVALID_SIZE` static constexpr | `size_t` | Invalid size. |
diff --git a/fern/cudapages/cub/cub/cub/CastOp.mdx b/fern/cudapages/cub/cub/cub/CastOp.mdx
new file mode 100644
index 0000000..acb2e6e
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/CastOp.mdx
@@ -0,0 +1,32 @@
+---
+title: cub::CastOp
+description: "Default cast functor."
+---
+
+Default cast functor.
+
+
+
+
+
+
+
+
+
+
+---
+
+## Methods
+
+### operator() inline const
+
+Cast operator, returns `(B) a`.
+
+
+```cpp showLineNumbers={false}
+template <typename A>
+B cub::CastOp::operator()(
+ A &&a
+) const
+```
+
diff --git a/fern/cudapages/cub/cub/cub/ChainedPolicy.mdx b/fern/cudapages/cub/cub/cub/ChainedPolicy.mdx
new file mode 100644
index 0000000..9ddf549
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/ChainedPolicy.mdx
@@ -0,0 +1,84 @@
+---
+title: cub::ChainedPolicy
+description: "Helper for dispatching into a policy chain."
+---
+
+Helper for dispatching into a policy chain.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Methods
+
+### runtime_arch_to_compiletime inline static
+
+
+```cpp showLineNumbers={false}
+template <typename FunctorT>
+static cudaError_t cub::ChainedPolicy::runtime_arch_to_compiletime(
+ int device_ptx_version,
+ FunctorT &op
+)
+```
+
+
+### find_and_invoke_policy inline static
+
+
+```cpp showLineNumbers={false}
+template <typename FunctorT>
+static cudaError_t cub::ChainedPolicy::find_and_invoke_policy(
+ FunctorT &op
+)
+```
+
+
+---
+
+## Static methods
+
+### Invoke inline static
+
+Specializes and dispatches `op` in accordance with the first policy in the chain of adequate PTX version.
+
+
+```cpp showLineNumbers={false}
+template <typename FunctorT>
+static cudaError_t cub::ChainedPolicy::Invoke(
+ int device_ptx_version,
+ FunctorT &op
+)
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `ActivePolicy` | `typename ::cuda::std::_If<(CUB_PTX_ARCH< PolicyPtxVersion &&have_previous_policy), detail::get_active_policy< PrevPolicyT >, ::cuda::std::type_identity< PolicyT > >::type` | The policy for the active compiler pass. |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `have_previous_policy` static constexpr | `bool` | |
diff --git a/fern/cudapages/cub/cub/cub/DeviceAdjacentDifference.mdx b/fern/cudapages/cub/cub/cub/DeviceAdjacentDifference.mdx
new file mode 100644
index 0000000..e7914dc
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceAdjacentDifference.mdx
@@ -0,0 +1,524 @@
+---
+title: cub::DeviceAdjacentDifference
+description: ""
+---
+
+DeviceAdjacentDifference provides device-wide, parallel operations for computing the differences of adjacent elements residing within device-accessible memory.
+
+## Example
+
+The code snippet below illustrates how to use `DeviceAdjacentDifference` to compute the left difference between adjacent elements.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_adjacent_difference.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+int num_items; // e.g., 8
+int *d_values; // e.g., [1, 2, 1, 2, 1, 2, 1, 2]
+//...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+
+cub::DeviceAdjacentDifference::SubtractLeft(
+ d_temp_storage, temp_storage_bytes, d_values, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run operation
+cub::DeviceAdjacentDifference::SubtractLeft(
+ d_temp_storage, temp_storage_bytes, d_values, num_items);
+
+// d_values <-- [1, 1, -1, 1, -1, 1, -1, 1]
+```
+
+---
+
+## Methods
+
+### AdjacentDifference inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceAdjacentDifference::AdjacentDifference(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_input,
+ OutputIteratorT d_output,
+ NumItemsT num_items,
+ DifferenceOpT difference_op,
+ cudaStream_t stream
+)
+```
+
+
+---
+
+## Static methods
+
+### SubtractLeftCopy inline static
+
+Subtracts the left element of each adjacent pair of elements residing within device-accessible memory.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceAdjacentDifference::SubtractLeftCopy(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_input,
+ OutputIteratorT d_output,
+ NumItemsT num_items,
+ DifferenceOpT difference_op = {},
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Is a model of Input Iterator. If `x` and `y` are objects of `InputIteratorT`'s `value_type`, then
+`x - y` is defined, `InputIteratorT`'s `value_type` is convertible to
+a type in `OutputIteratorT`'s set of `value_types`, and the return type
+of `x - y` is convertible to a type in `OutputIteratorT`'s set of
+`value_types`.
+
+
+
+Is a model of Output Iterator.
+
+
+
+Its `result_type` is convertible to a type in `OutputIteratorT`'s set of `value_types`.
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence
+
+
+
+Pointer to the output sequence
+
+
+
+Number of items in the input sequence
+
+
+
+The binary function used to compute differences
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+**Example**
+
+The code snippet below illustrates how to use `DeviceAdjacentDifference` to compute the difference between adjacent elements.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_adjacent_difference.cuh>
+
+struct CustomDifference
+{
+ template <typename DataType>
+ __host__ DataType operator()(DataType &lhs, DataType &rhs)
+ {
+ return lhs - rhs;
+ }
+};
+
+// Declare, allocate, and initialize device-accessible pointers
+int num_items; // e.g., 8
+int *d_input; // e.g., [1, 2, 1, 2, 1, 2, 1, 2]
+int *d_output;
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+
+cub::DeviceAdjacentDifference::SubtractLeftCopy(
+ d_temp_storage, temp_storage_bytes,
+ d_input, d_output,
+ num_items, CustomDifference());
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run operation
+cub::DeviceAdjacentDifference::SubtractLeftCopy(
+ d_temp_storage, temp_storage_bytes,
+ d_input, d_output,
+ num_items, CustomDifference());
+
+// d_input <-- [1, 2, 1, 2, 1, 2, 1, 2]
+// d_output <-- [1, 1, -1, 1, -1, 1, -1, 1]
+```
+
+### SubtractLeft inline static
+
+Subtracts the left element of each adjacent pair of elements residing within device-accessible memory.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceAdjacentDifference::SubtractLeft(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ RandomAccessIteratorT d_input,
+ NumItemsT num_items,
+ DifferenceOpT difference_op = {},
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Is a model of Random Access Iterator, and `RandomAccessIteratorT` is mutable. If `x` and `y` are objects of
+`RandomAccessIteratorT`'s `value_type`, and `x - y` is defined, then the
+return type of `x - y` should be convertible to a type in
+`RandomAccessIteratorT`'s set of `value_types`.
+
+
+
+Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s set of `value_types`.
+
+
+
+**[inferred]** Type of `num_items`
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence and the result
+
+
+
+Number of items in the input sequence
+
+
+
+The binary function used to compute differences
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+**Example**
+
+The code snippet below illustrates how to use `DeviceAdjacentDifference` to compute the difference between adjacent elements.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_adjacent_difference.cuh>
+
+struct CustomDifference
+{
+ template <typename DataType>
+ __host__ DataType operator()(DataType &lhs, DataType &rhs)
+ {
+ return lhs - rhs;
+ }
+};
+
+// Declare, allocate, and initialize device-accessible pointers
+int num_items; // e.g., 8
+int *d_data; // e.g., [1, 2, 1, 2, 1, 2, 1, 2]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceAdjacentDifference::SubtractLeft(
+ d_temp_storage, temp_storage_bytes,
+ d_data, num_items, CustomDifference());
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run operation
+cub::DeviceAdjacentDifference::SubtractLeft(
+ d_temp_storage, temp_storage_bytes,
+ d_data, num_items, CustomDifference());
+
+// d_data <-- [1, 1, -1, 1, -1, 1, -1, 1]
+```
+
+### SubtractRightCopy inline static
+
+Subtracts the right element of each adjacent pair of elements residing within device-accessible memory.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceAdjacentDifference::SubtractRightCopy(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_input,
+ OutputIteratorT d_output,
+ NumItemsT num_items,
+ DifferenceOpT difference_op = {},
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Is a model of Input Iterator. If `x` and `y` are objects of `InputIteratorT`'s `value_type`, then
+`x - y` is defined, `InputIteratorT`'s `value_type` is convertible to
+a type in `OutputIteratorT`'s set of `value_types`, and the return type
+of `x - y` is convertible to a type in `OutputIteratorT`'s set of
+`value_types`.
+
+
+
+Is a model of Output Iterator.
+
+
+
+Its `result_type` is convertible to a type in `OutputIteratorT`'s set of `value_types`.
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence
+
+
+
+Pointer to the output sequence
+
+
+
+Number of items in the input sequence
+
+
+
+The binary function used to compute differences.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+**Example**
+
+The code snippet below illustrates how to use `DeviceAdjacentDifference` to compute the difference between adjacent elements.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_adjacent_difference.cuh>
+
+struct CustomDifference
+{
+ template <typename DataType>
+ __host__ DataType operator()(DataType &lhs, DataType &rhs)
+ {
+ return lhs - rhs;
+ }
+};
+
+// Declare, allocate, and initialize device-accessible pointers
+int num_items; // e.g., 8
+int *d_input; // e.g., [1, 2, 1, 2, 1, 2, 1, 2]
+int *d_output;
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceAdjacentDifference::SubtractRightCopy(
+ d_temp_storage, temp_storage_bytes,
+ d_input, d_output, num_items, CustomDifference());
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run operation
+cub::DeviceAdjacentDifference::SubtractRightCopy(
+ d_temp_storage, temp_storage_bytes,
+ d_input, d_output, num_items, CustomDifference());
+
+// d_input <-- [1, 2, 1, 2, 1, 2, 1, 2]
+// d_output <-- [-1, 1, -1, 1, -1, 1, -1, 2]
+```
+
+### SubtractRight inline static
+
+Subtracts the right element of each adjacent pair of elements residing within device-accessible memory.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceAdjacentDifference::SubtractRight(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ RandomAccessIteratorT d_input,
+ NumItemsT num_items,
+ DifferenceOpT difference_op = {},
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Is a model of Random Access Iterator, and `RandomAccessIteratorT` is mutable. If `x` and `y` are objects of
+`RandomAccessIteratorT`'s `value_type`, and `x - y` is defined, then the
+return type of `x - y` should be convertible to a type in
+`RandomAccessIteratorT`'s set of `value_types`.
+
+
+
+Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s set of `value_types`.
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence and the result
+
+
+
+Number of items in the input sequence
+
+
+
+The binary function used to compute differences
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+**Example**
+
+The code snippet below illustrates how to use `DeviceAdjacentDifference` to compute the difference between adjacent elements.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_adjacent_difference.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+int num_items; // e.g., 8
+int *d_data; // e.g., [1, 2, 1, 2, 1, 2, 1, 2]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceAdjacentDifference::SubtractRight(
+ d_temp_storage, temp_storage_bytes, d_data, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run operation
+cub::DeviceAdjacentDifference::SubtractRight(
+ d_temp_storage, temp_storage_bytes, d_data, num_items);
+
+// d_data <-- [-1, 1, -1, 1, -1, 1, -1, 2]
+```
diff --git a/fern/cudapages/cub/cub/cub/DeviceCopy.mdx b/fern/cudapages/cub/cub/cub/DeviceCopy.mdx
new file mode 100644
index 0000000..345a2cf
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceCopy.mdx
@@ -0,0 +1,235 @@
+---
+title: cub::DeviceCopy
+description: "cub::DeviceCopy provides device-wide, parallel operations for copying data."
+---
+
+`cub::DeviceCopy` provides device-wide, parallel operations for copying data.
+
+---
+
+## Static methods
+
+### Batched inline static
+
+Copies data from a batch of given source ranges to their corresponding destination ranges.
+
+**Note:** If any input range aliases any output range, the behavior is undefined. If any output range aliases another output range, the behavior is undefined. Input ranges can alias one another.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceCopy::Batched(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIt input_it,
+ OutputIt output_it,
+ SizeIteratorT sizes,
+ ::cuda::std::int64_t num_ranges,
+ cudaStream_t stream = nullptr
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+**[inferred]** Device-accessible random-access input iterator type providing the iterators to the source ranges
+
+
+
+**[inferred]** Device-accessible random-access input iterator type providing the iterators to the destination ranges
+
+
+
+**[inferred]** Device-accessible random-access input iterator type providing the number of items to be copied for each pair of ranges
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Device-accessible iterator providing the iterators to the source ranges
+
+
+
+Device-accessible iterator providing the iterators to the destination ranges
+
+
+
+Device-accessible iterator providing the number of elements to be copied for each pair of ranges
+
+
+
+The total number of range pairs
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+**Example**
+
+The code snippet below illustrates usage of DeviceCopy::Batched to perform a DeviceRunLength Decode operation.
+
+```cpp showLineNumbers={false}
+struct GetIteratorToRange
+{
+ __host__ __device__ __forceinline__ auto operator()(uint32_t index)
+ {
+ return thrust::make_constant_iterator(d_data_in[index]);
+ }
+ int32_t *d_data_in;
+};
+
+struct GetPtrToRange
+{
+ __host__ __device__ __forceinline__ auto operator()(uint32_t index)
+ {
+ return d_data_out + d_offsets[index];
+ }
+ int32_t *d_data_out;
+ uint32_t *d_offsets;
+};
+
+struct GetRunLength
+{
+ __host__ __device__ __forceinline__ uint32_t operator()(uint32_t index)
+ {
+ return d_offsets[index + 1] - d_offsets[index];
+ }
+ uint32_t *d_offsets;
+};
+
+uint32_t num_ranges = 5;
+int32_t *d_data_in; // e.g., [4, 2, 7, 3, 1]
+int32_t *d_data_out; // e.g., [0, ... ]
+uint32_t *d_offsets; // e.g., [0, 2, 5, 6, 9, 14]
+
+// Returns a constant iterator to the element of the i-th run
+thrust::counting_iterator<uint32_t> iota(0);
+auto iterators_in = thrust::make_transform_iterator(iota, GetIteratorToRange{d_data_in});
+
+// Returns the run length of the i-th run
+auto sizes = thrust::make_transform_iterator(iota, GetRunLength{d_offsets});
+
+// Returns pointers to the output range for each run
+auto ptrs_out = thrust::make_transform_iterator(iota, GetPtrToRange{d_data_out, d_offsets});
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes,
+num_ranges);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run batched copy algorithm (used to perform runlength decoding)
+cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes,
+num_ranges);
+
+// d_data_out <-- [4, 4, 2, 2, 2, 7, 3, 3, 3, 1, 1, 1, 1, 1]
+```
+
+### Copy inline static nodiscard
+
+Copies data from a multidimensional source mdspan to a destination mdspan.
+
+This function performs a parallel copy operation between two mdspan objects with potentially different layouts but identical extents. The copy operation handles arbitrary-dimensional arrays and automatically manages layout transformations.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceCopy::Copy(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ ::cuda::std::mdspan mdspan_in,
+ ::cuda::std::mdspan mdspan_out,
+ ::cudaStream_t stream = nullptr
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Returns:** **cudaSuccess** on success, **cudaErrorInvalidValue** if mdspan extents don't match, or error code on failure
+
+**Template parameters**
+
+
+**[inferred]** The element type of the source mdspan
+
+
+
+**[inferred]** The extents type of the source mdspan
+
+
+
+**[inferred]** The layout type of the source mdspan
+
+
+
+**[inferred]** The accessor type of the source mdspan
+
+
+
+**[inferred]** The element type of the destination mdspan
+
+
+
+**[inferred]** The extents type of the destination mdspan
+
+
+
+**[inferred]** The layout type of the destination mdspan
+
+
+
+**[inferred]** The accessor type of the destination mdspan
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Source mdspan containing the data to be copied
+
+
+
+Destination mdspan where the data will be copied
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
diff --git a/fern/cudapages/cub/cub/cub/DeviceFind.mdx b/fern/cudapages/cub/cub/cub/DeviceFind.mdx
new file mode 100644
index 0000000..d777461
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceFind.mdx
@@ -0,0 +1,270 @@
+---
+title: cub::DeviceFind
+description: ""
+---
+
+---
+
+## Static methods
+
+### FindIf inline static
+
+Finds the first element in the input sequence that satisfies the given predicate.
+
+The code snippet below illustrates the finding of the first element that satisfies the predicate.
+
+Predicate example: see the C++ snippet between the `example-begin find-if-predicate` and `example-end find-if-predicate` markers in `cub/test/catch2_test_device_find_if_api.cu`.
+
+Usage example: see the C++ snippet between the `example-begin device-find-if` and `example-end device-find-if` markers in `cub/test/catch2_test_device_find_if_api.cu`.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFind::FindIf(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ScanOpT scan_op,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+- The search terminates at the first element where the predicate evaluates to true.
+- The index of the found element is written to `d_out`.
+- If no element satisfies the predicate, `num_items` is written to `d_out`.
+- The range `[d_out, d_out + 1)` shall not overlap `[d_in, d_in + num_items)` in any way.
+- When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing the result index (may be a simple pointer type)
+
+
+
+**[inferred]** Unary predicate functor type having member `bool operator()(const T &a)`
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output location for the index of the found element
+
+
+
+Unary predicate functor for determining whether an element satisfies the search condition
+
+
+
+Total number of input items (i.e., the length of `d_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+### LowerBound inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFind::LowerBound(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ RangeIteratorT d_range,
+ RangeNumItemsT range_num_items,
+ ValuesIteratorT d_values,
+ ValuesNumItemsT values_num_items,
+ OutputIteratorT d_output,
+ CompareOpT comp,
+ cudaStream_t stream = 0
+)
+```
+
+
+**Template parameters**
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator), whose value type forms a [Relation](https://en.cppreference.com/w/cpp/concepts/relation) with the value type of `ValuesIteratorT` using `CompareOpT` as the predicate.
+
+
+
+Is an integral type representing the number of elements in the range to be searched.
+
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator), whose value type forms a [Relation](https://en.cppreference.com/w/cpp/concepts/relation) with the value type of `RangeIteratorT` using `CompareOpT` as the predicate.
+
+
+
+Is a model of integral type representing the number of elements in the range of values to be searched for.
+
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator), whose value type is assignable from `RangeIteratorT`'s difference type.
+
+
+
+Is a model of [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order), which forms a [Relation](https://en.cppreference.com/w/cpp/concepts/relation) with the value types of `RangeIteratorT` and `ValuesIteratorT`.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Iterator to the beginning of the ordered range to be searched.
+
+
+
+Number of elements in the ordered range to be searched.
+
+
+
+Iterator to the beginning of the range of values to be searched for.
+
+
+
+Number of elements in the range of values to be searched for.
+
+
+
+Iterator to the beginning of the output range.
+
+
+
+Comparison function object which returns true if its first argument is ordered before the second in the [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order) of the range to be searched.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+### UpperBound inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFind::UpperBound(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ RangeIteratorT d_range,
+ RangeNumItemsT range_num_items,
+ ValuesIteratorT d_values,
+ ValuesNumItemsT values_num_items,
+ OutputIteratorT d_output,
+ CompareOpT comp,
+ cudaStream_t stream = 0
+)
+```
+
+
+**Template parameters**
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator), whose value type forms a [Relation](https://en.cppreference.com/w/cpp/concepts/relation) with the value type of `ValuesIteratorT` using `CompareOpT` as the predicate.
+
+
+
+Is an integral type representing the number of elements in the range to be searched.
+
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator), whose value type forms a [Relation](https://en.cppreference.com/w/cpp/concepts/relation) with the value type of `RangeIteratorT` using `CompareOpT` as the predicate.
+
+
+
+Is a model of integral type representing the number of elements in the range of values to be searched for.
+
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator), whose value type is assignable from `RangeIteratorT`'s difference type.
+
+
+
+Is a model of [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order), which forms a [Relation](https://en.cppreference.com/w/cpp/concepts/relation) with the value types of `RangeIteratorT` and `ValuesIteratorT`.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Iterator to the beginning of the ordered range to be searched.
+
+
+
+Number of elements in the ordered range to be searched.
+
+
+
+Iterator to the beginning of the range of values to be searched for.
+
+
+
+Number of elements in the range of values to be searched for.
+
+
+
+Iterator to the beginning of the output range.
+
+
+
+Comparison function object which returns true if its first argument is ordered before the second in the [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order) of the range to be searched.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
diff --git a/fern/cudapages/cub/cub/cub/DeviceFor.mdx b/fern/cudapages/cub/cub/cub/DeviceFor.mdx
new file mode 100644
index 0000000..06c81f1
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceFor.mdx
@@ -0,0 +1,728 @@
+---
+title: cub::DeviceFor
+description: ""
+---
+
+---
+
+## Methods
+
+### for_each_n inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFor::for_each_n(
+ RandomAccessOrContiguousIteratorT first,
+ OffsetT num_items,
+ OpT op,
+ cudaStream_t stream
+)
+```
+
+
+### ForEachNNoNVTX inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFor::ForEachNNoNVTX(
+ RandomAccessIteratorT first,
+ NumItemsT num_items,
+ OpT op,
+ cudaStream_t stream = {}
+)
+```
+
+
+### ForEachCopyNNoNVTX inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFor::ForEachCopyNNoNVTX(
+ RandomAccessIteratorT first,
+ NumItemsT num_items,
+ OpT op,
+ cudaStream_t stream = {}
+)
+```
+
+
+---
+
+## Static methods
+
+### Bulk inline static
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFor::Bulk(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ ShapeT shape,
+ OpT op,
+ cudaStream_t stream = {}
+)
+```
+
+
+*Added in v2.4.0. First appears in CUDA Toolkit 12.5.*
+
+**Template parameters**
+
+
+Is an integral type
+
+
+
+Is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Shape of the index space to iterate over
+
+
+
+Function object to apply to each index in the index space
+
+
+
+CUDA stream to launch kernels within. Default stream is `0`.
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFor::Bulk(
+ ShapeT shape,
+ OpT op,
+ cudaStream_t stream = {}
+)
+```
+
+
+*Added in v2.4.0. First appears in CUDA Toolkit 12.5.*
+
+**Template parameters**
+
+
+Is an integral type
+
+
+
+Is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+
+
+**Parameters**
+
+
+Shape of the index space to iterate over
+
+
+
+Function object to apply to each index in the index space
+
+
+
+CUDA stream to launch kernels within. Default stream is `0`.
+
+
+
+
+
+### ForEachN inline static
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFor::ForEachN(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ RandomAccessIteratorT first,
+ NumItemsT num_items,
+ OpT op,
+ cudaStream_t stream = {}
+)
+```
+
+
+*Added in v2.4.0. First appears in CUDA Toolkit 12.5.*
+
+**Template parameters**
+
+
+Is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
+
+
+
+Is an integral type representing the number of elements to iterate over
+
+
+
+Is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The beginning of the sequence
+
+
+
+Number of elements to iterate over
+
+
+
+Function object to apply to each element in the range
+
+
+
+CUDA stream to launch kernels within. Default stream is `0`.
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFor::ForEachN(
+ RandomAccessIteratorT first,
+ NumItemsT num_items,
+ OpT op,
+ cudaStream_t stream = {}
+)
+```
+
+
+*Added in v2.4.0. First appears in CUDA Toolkit 12.5.*
+
+**Template parameters**
+
+
+Is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
+
+
+
+Is an integral type representing the number of elements to iterate over
+
+
+
+Is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+
+
+**Parameters**
+
+
+The beginning of the sequence
+
+
+
+Number of elements to iterate over
+
+
+
+Function object to apply to each element in the range
+
+
+
+CUDA stream to launch kernels within. Default stream is `0`.
+
+
+
+
+
+### ForEach inline static
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFor::ForEach(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ RandomAccessIteratorT first,
+ RandomAccessIteratorT last,
+ OpT op,
+ cudaStream_t stream = {}
+)
+```
+
+
+*Added in v2.4.0. First appears in CUDA Toolkit 12.5.*
+
+**Template parameters**
+
+
+Is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
+
+
+
+Is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The beginning of the sequence
+
+
+
+The end of the sequence
+
+
+
+Function object to apply to each element in the range
+
+
+
+CUDA stream to launch kernels within. Default stream is `0`.
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFor::ForEach(
+ RandomAccessIteratorT first,
+ RandomAccessIteratorT last,
+ OpT op,
+ cudaStream_t stream = {}
+)
+```
+
+
+*Added in v2.4.0. First appears in CUDA Toolkit 12.5.*
+
+**Template parameters**
+
+
+Is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
+
+
+
+Is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+
+
+**Parameters**
+
+
+The beginning of the sequence
+
+
+
+The end of the sequence
+
+
+
+Function object to apply to each element in the range
+
+
+
+CUDA stream to launch kernels within. Default stream is `0`.
+
+
+
+
+
+### ForEachCopyN inline static
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFor::ForEachCopyN(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ RandomAccessIteratorT first,
+ NumItemsT num_items,
+ OpT op,
+ cudaStream_t stream = {}
+)
+```
+
+
+*Added in v2.4.0. First appears in CUDA Toolkit 12.5.*
+
+**Template parameters**
+
+
+Is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
+
+
+
+Is an integral type representing the number of elements to iterate over
+
+
+
+Is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The beginning of the sequence
+
+
+
+Number of elements to iterate over
+
+
+
+Function object to apply to a copy of each element in the range
+
+
+
+CUDA stream to launch kernels within. Default stream is `0`.
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFor::ForEachCopyN(
+ RandomAccessIteratorT first,
+ NumItemsT num_items,
+ OpT op,
+ cudaStream_t stream = {}
+)
+```
+
+
+*Added in v2.4.0. First appears in CUDA Toolkit 12.5.*
+
+**Template parameters**
+
+
+Is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
+
+
+
+Is an integral type representing the number of elements to iterate over
+
+
+
+Is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+
+
+**Parameters**
+
+
+The beginning of the sequence
+
+
+
+Number of elements to iterate over
+
+
+
+Function object to apply to a copy of each element in the range
+
+
+
+CUDA stream to launch kernels within. Default stream is `0`.
+
+
+
+
+
+### ForEachCopy inline static
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFor::ForEachCopy(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ RandomAccessIteratorT first,
+ RandomAccessIteratorT last,
+ OpT op,
+ cudaStream_t stream = {}
+)
+```
+
+
+*Added in v2.4.0. First appears in CUDA Toolkit 12.5.*
+
+**Template parameters**
+
+
+Is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
+
+
+
+Is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The beginning of the sequence
+
+
+
+The end of the sequence
+
+
+
+Function object to apply to a copy of each element in the range
+
+
+
+CUDA stream to launch kernels within. Default stream is `0`.
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFor::ForEachCopy(
+ RandomAccessIteratorT first,
+ RandomAccessIteratorT last,
+ OpT op,
+ cudaStream_t stream = {}
+)
+```
+
+
+*Added in v2.4.0. First appears in CUDA Toolkit 12.5.*
+
+**Template parameters**
+
+
+Is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
+
+
+
+Is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+
+
+**Parameters**
+
+
+The beginning of the sequence
+
+
+
+The end of the sequence
+
+
+
+Function object to apply to a copy of each element in the range
+
+
+
+CUDA stream to launch kernels within. Default stream is `0`.
+
+
+
+
+
+### ForEachInExtents inline static
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFor::ForEachInExtents(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const ::cuda::std::extents &extents,
+ OpType op,
+ cudaStream_t stream = {}
+)
+```
+
+
+*Added in v2.4.0. First appears in CUDA Toolkit 12.5.*
+
+**Returns:** cudaError_t error status
+
+**Template parameters**
+
+
+Is an integral type that represents the extent index space (automatically deduced)
+
+
+
+Are the extent sizes for each rank index (automatically deduced)
+
+
+
+Is a function object with arity equal to the number of extents + 1 for the linear index (iteration)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Extents object that represents a multi-dimensional index space
+
+
+
+Function object to apply to each linear index (iteration) and multi-dimensional coordinates
+
+
+
+CUDA stream to launch kernels within. Default stream is `NULL`
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFor::ForEachInExtents(
+ const ::cuda::std::extents &extents,
+ OpType op,
+ cudaStream_t stream = {}
+)
+```
+
+
+*Added in v2.4.0. First appears in CUDA Toolkit 12.5.*
+
+**Returns:** cudaError_t error status
+
+**Template parameters**
+
+
+Is an integral type that represents the extent index space (automatically deduced)
+
+
+
+Are the extent sizes for each rank index (automatically deduced)
+
+
+
+Is a function object with arity equal to the number of extents + 1 for the linear index (iteration)
+
+
+**Parameters**
+
+
+Extents object that represents a multi-dimensional index space
+
+
+
+Function object to apply to each linear index (iteration) and multi-dimensional coordinates
+
+
+
+CUDA stream to launch kernels within. Default stream is `NULL`
+
+
+
+
+
+### ForEachInLayout inline static nodiscard
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceFor::ForEachInLayout(
+ const LayoutMapping &layout_mapping,
+ OpType op,
+ cudaStream_t stream = {}
+)
+```
+
+
+*Added in v2.4.0. First appears in CUDA Toolkit 12.5.*
+
+**Returns:** cudaError_t error status
+
+**Template parameters**
+
+
+**[inferred]** A function object with arity equal to the number of extents + 1 for the linear index (iteration). The first parameter is the linear index, followed by one parameter for each dimension coordinate.
+
+
+**Parameters**
+
+
+Function object to apply to each linear index (iteration) and multi-dimensional coordinates. Called as `op(linear_index, coord_0, coord_1, ..., coord_n)`
+
+
+
+CUDA stream to launch kernels within. Default stream is `nullptr`
+
diff --git a/fern/cudapages/cub/cub/cub/DeviceHistogram.mdx b/fern/cudapages/cub/cub/cub/DeviceHistogram.mdx
new file mode 100644
index 0000000..60fde64
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceHistogram.mdx
@@ -0,0 +1,1313 @@
+---
+title: cub::DeviceHistogram
+description: ""
+---
+
+DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.
+
+---
+
+## Evenly-segmented bin ranges
+
+### HistogramEven inline static
+
+
+
+
+Computes an intensity histogram from a sequence of data samples using equal-width bins.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceHistogram::HistogramEven(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ CounterT *d_histogram,
+ int num_levels,
+ LevelT lower_level,
+ LevelT upper_level,
+ OffsetT num_samples,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+- The number of histogram bins is (`num_levels - 1`).
+- All bins comprise the same width of sample values: `(upper_level - lower_level) / (num_levels - 1)`.
+- If the common type of `SampleT` and `LevelT` is of integral type, the bin for a sample is computed as `(sample - lower_level) * (num_levels - 1) / (upper_level - lower_level)`, rounded down to the nearest whole number. To protect against potential overflows, if the product `(upper_level - lower_level) * (num_levels - 1)` exceeds the number representable by a `uint64_t`, the CUDA error `cudaErrorInvalidValue` is returned. If the common type is 128 bits wide, bin computation will use 128-bit arithmetic and `cudaErrorInvalidValue` will only be returned if bin computation would overflow for 128-bit arithmetic.
+- The ranges `[d_samples, d_samples + num_samples)` and `[d_histogram, d_histogram + num_levels - 1)` shall not overlap in any way.
+- `cuda::std::common_type<LevelT, SampleT>` must be valid, and both `LevelT` and `SampleT` must be valid arithmetic types. The common type must be convertible to `int` and trivially copyable.
+- When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input samples (may be a simple pointer type)
+
+
+
+**[inferred]** Integer type for histogram bin counters
+
+
+
+**[inferred]** Type for specifying boundaries (levels)
+
+
+
+**[inferred]** Signed integer type for sequence offsets, list lengths, pointer differences, etc. (Consider using 32-bit values as offsets/lengths/etc. For example, `int` will typically yield better performance than `size_t` in 64-bit memory mode.)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The pointer to the input sequence of data samples.
+
+
+
+The pointer to the histogram counter output array of length `num_levels - 1`.
+
+
+
+The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is `num_levels - 1`.
+
+
+
+The lower sample value bound (inclusive) for the lowest histogram bin.
+
+
+
+The upper sample value bound (exclusive) for the highest histogram bin.
+
+
+
+The number of input samples (i.e., the length of `d_samples`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the computation of a six-bin histogram from a sequence of float samples
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input samples and output histogram
+int num_samples; // e.g., 10
+float* d_samples; // e.g., [2.2, 6.1, 7.1, 2.9, 3.5, 0.3, 2.9, 2.1, 6.1, 999.5]
+int* d_histogram; // e.g., [ -, -, -, -, -, -]
+int num_levels; // e.g., 7 (seven level boundaries for six bins)
+float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin)
+float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin)
+...
+
+// Determine temporary device storage requirements
+void* d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceHistogram::HistogramEven(
+ d_temp_storage, temp_storage_bytes,
+ d_samples, d_histogram, num_levels,
+ lower_level, upper_level, num_samples);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Compute histograms
+cub::DeviceHistogram::HistogramEven(
+ d_temp_storage, temp_storage_bytes,
+ d_samples, d_histogram, num_levels,
+ lower_level, upper_level, num_samples);
+
+// d_histogram <-- [1, 5, 0, 3, 0, 0];
+```
+
+
+
+
+Computes an intensity histogram from a sequence of data samples using equal-width bins.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceHistogram::HistogramEven(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ CounterT *d_histogram,
+ int num_levels,
+ LevelT lower_level,
+ LevelT upper_level,
+ OffsetT num_row_samples,
+ OffsetT num_rows,
+ size_t row_stride_bytes,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+- A two-dimensional *region of interest* within `d_samples` can be specified using the `num_row_samples`, `num_rows`, and `row_stride_bytes` parameters.
+- The row stride must be a whole multiple of the sample data type size, i.e., `(row_stride_bytes % sizeof(SampleT)) == 0`.
+- The number of histogram bins is (`num_levels - 1`).
+- All bins comprise the same width of sample values: `(upper_level - lower_level) / (num_levels - 1)`.
+- If the common type of `SampleT` and `LevelT` is of integral type, the bin for a sample is computed as `(sample - lower_level) * (num_levels - 1) / (upper_level - lower_level)`, rounded down to the nearest whole number. To protect against potential overflows, if the product `(upper_level - lower_level) * (num_levels - 1)` exceeds the number representable by a `uint64_t`, the CUDA error `cudaErrorInvalidValue` is returned. If the common type is 128 bits wide, bin computation will use 128-bit arithmetic and `cudaErrorInvalidValue` will only be returned if bin computation would overflow for 128-bit arithmetic.
+- For a given row `r` in `[0, num_rows)`, let `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)` and `row_end = row_begin + num_row_samples`. The ranges `[row_begin, row_end)` and `[d_histogram, d_histogram + num_levels - 1)` shall not overlap in any way.
+- `cuda::std::common_type<LevelT, SampleT>` must be valid, and both `LevelT` and `SampleT` must be valid arithmetic types. The common type must be convertible to `int` and trivially copyable.
+- When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input samples. (may be a simple pointer type)
+
+
+
+**[inferred]** Integer type for histogram bin counters
+
+
+
+**[inferred]** Type for specifying boundaries (levels)
+
+
+
+**[inferred]** Signed integer type for sequence offsets, list lengths, pointer differences, etc. (Consider using 32-bit values as offsets/lengths/etc. For example, `int` will typically yield better performance than `size_t` in 64-bit memory mode.)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The pointer to the input sequence of data samples.
+
+
+
+The pointer to the histogram counter output array of length `num_levels - 1`.
+
+
+
+The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is `num_levels - 1`.
+
+
+
+The lower sample value bound (inclusive) for the lowest histogram bin.
+
+
+
+The upper sample value bound (exclusive) for the highest histogram bin.
+
+
+
+The number of data samples per row in the region of interest
+
+
+
+The number of rows in the region of interest
+
+
+
+The number of bytes between starts of consecutive rows in the region of interest
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the computation of a six-bin histogram from a 2x5 region of interest within a flattened 2x7 array of float samples.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input samples and output histogram
+int num_row_samples; // e.g., 5
+int num_rows; // e.g., 2;
+size_t row_stride_bytes; // e.g., 7 * sizeof(float)
+float* d_samples; // e.g., [2.2, 6.1, 7.1, 2.9, 3.5, -, -,
+ // 0.3, 2.9, 2.1, 6.1, 999.5, -, -]
+int* d_histogram; // e.g., [ -, -, -, -, -, -]
+int num_levels; // e.g., 7 (seven level boundaries for six bins)
+float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin)
+float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin)
+...
+
+// Determine temporary device storage requirements
+void* d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceHistogram::HistogramEven(
+ d_temp_storage, temp_storage_bytes,
+ d_samples, d_histogram, num_levels, lower_level, upper_level,
+ num_row_samples, num_rows, row_stride_bytes);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Compute histograms
+cub::DeviceHistogram::HistogramEven(
+ d_temp_storage, temp_storage_bytes,
+ d_samples, d_histogram, num_levels, lower_level, upper_level,
+ num_row_samples, num_rows, row_stride_bytes);
+
+// d_histogram <-- [1, 5, 0, 3, 0, 0];
+```
+
+
+
+
+### MultiHistogramEven inline static
+
+
+
+
+Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceHistogram::MultiHistogramEven(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ ::cuda::std::array d_histogram,
+ ::cuda::std::array num_levels,
+ ::cuda::std::array lower_level,
+ ::cuda::std::array upper_level,
+ OffsetT num_pixels,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+- The input is a sequence of *pixel* structures, where each pixel comprises a record of `NUM_CHANNELS` consecutive data samples (e.g., an *RGBA* pixel).
+- `NUM_CHANNELS` can be up to 4.
+- Of the `NUM_CHANNELS` specified, the function will only compute histograms for the first `NUM_ACTIVE_CHANNELS` (e.g., only *RGB* histograms from *RGBA* pixel samples).
+- The number of histogram bins for channel<sub>i</sub> is `num_levels[i] - 1`.
+- For channel<sub>i</sub>, the range of values for all histogram bins have the same width: `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`.
+- If the common type of sample and level is of integral type, the bin for a sample is computed as `(sample - lower_level[i]) * (num_levels[i] - 1) / (upper_level[i] - lower_level[i])`, rounded down to the nearest whole number. To protect against potential overflows, if, for any channel `i`, the product `(upper_level[i] - lower_level[i]) * (num_levels[i] - 1)` exceeds the number representable by a `uint64_t`, the CUDA error `cudaErrorInvalidValue` is returned. If the common type is 128 bits wide, bin computation will use 128-bit arithmetic and `cudaErrorInvalidValue` will only be returned if bin computation would overflow for 128-bit arithmetic.
+- For a given channel `c` in `[0, NUM_ACTIVE_CHANNELS)`, the ranges `[d_samples, d_samples + NUM_CHANNELS * num_pixels)` and `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap in any way.
+- `cuda::std::common_type<LevelT, SampleT>` must be valid, and both `LevelT` and `SampleT` must be valid arithmetic types. The common type must be convertible to `int` and trivially copyable.
+- When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+
+
+
+**[inferred]** Number of channels actively being histogrammed
+
+
+
+**[inferred]** Random-access input iterator type for reading input samples. (may be a simple pointer type)
+
+
+
+**[inferred]** Integer type for histogram bin counters
+
+
+
+**[inferred]** Type for specifying boundaries (levels)
+
+
+
+**[inferred]** Signed integer type for sequence offsets, list lengths, pointer differences, etc. (Consider using 32-bit values as offsets/lengths/etc. For example, `int` will typically yield better performance than `size_t` in 64-bit memory mode.)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four *RGBA* 8-bit samples).
+
+
+
+The pointers to the histogram counter output arrays, one for each active channel. For channel<sub>i</sub>, the allocation length of `d_histogram[i]` should be `num_levels[i] - 1`.
+
+
+
+The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub>i</sub> is `num_levels[i] - 1`.
+
+
+
+The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+
+
+
+The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+
+
+
+The number of multi-channel pixels (i.e., the length of `d_samples` divided by `NUM_CHANNELS`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+**Example**
+
+The code snippet below illustrates the computation of three 256-bin *RGB* histograms from a quad-channel sequence of *RGBA* pixels (8 bits per channel per pixel)
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input samples and output histograms
+int num_pixels; // e.g., 5
+unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2),
+ // (0, 6, 7, 5), (3, 0, 2, 6)]
+int* d_histogram[3]; // e.g., three device pointers to three device buffers,
+ // each allocated with 256 integer counters
+int num_levels[3]; // e.g., {257, 257, 257};
+unsigned int lower_level[3]; // e.g., {0, 0, 0};
+unsigned int upper_level[3]; // e.g., {256, 256, 256};
+...
+
+// Determine temporary device storage requirements
+void* d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceHistogram::MultiHistogramEven<4, 3>(
+ d_temp_storage, temp_storage_bytes,
+ d_samples, d_histogram, num_levels,
+ lower_level, upper_level, num_pixels);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Compute histograms
+cub::DeviceHistogram::MultiHistogramEven<4, 3>(
+ d_temp_storage, temp_storage_bytes,
+ d_samples, d_histogram, num_levels,
+ lower_level, upper_level, num_pixels);
+
+// d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0],
+// [3, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0],
+// [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ]
+```
+
+
+
+
+**Deprecated** [Since 3.0].
+
+
+```cpp showLineNumbers={false}
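+template <int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>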
+static cudaError_t cub::DeviceHistogram::MultiHistogramEven(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ CounterT *d_histogram[NUM_ACTIVE_CHANNELS],
+ const int num_levels[NUM_ACTIVE_CHANNELS],
+ const LevelT lower_level[NUM_ACTIVE_CHANNELS],
+ const LevelT upper_level[NUM_ACTIVE_CHANNELS],
+ OffsetT num_pixels,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+
+
+Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
+
+
+```cpp showLineNumbers={false}
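+template <int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
+static cudaError_t cub::DeviceHistogram::MultiHistogramEven(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ ::cuda::std::array<CounterT*, NUM_ACTIVE_CHANNELS> d_histogram,
+ ::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_levels,
+ ::cuda::std::array<LevelT, NUM_ACTIVE_CHANNELS> lower_level,
+ ::cuda::std::array<LevelT, NUM_ACTIVE_CHANNELS> upper_level,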
+ OffsetT num_row_pixels,
+ OffsetT num_rows,
+ size_t row_stride_bytes,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The input is a sequence of *pixel* structures, where each pixel comprises a record of `NUM_CHANNELS` consecutive data samples (e.g., an *RGBA* pixel).
+`NUM_CHANNELS` can be up to 4.
+Of the `NUM_CHANNELS` specified, the function will only compute histograms for the first `NUM_ACTIVE_CHANNELS` (e.g., only *RGB* histograms from *RGBA* pixel samples).
+A two-dimensional *region of interest* within `d_samples` can be specified using the `num_row_pixels`, `num_rows`, and `row_stride_bytes` parameters.
+The row stride must be a whole multiple of the sample data type size, i.e., `(row_stride_bytes % sizeof(SampleT)) == 0`.
+The number of histogram bins for channel<sub>i</sub> is `num_levels[i] - 1`.
+For channel<sub>i</sub>, all histogram bins cover value ranges of the same width: `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`.
+If the common type of sample and level is of integral type, the bin for a sample is computed as `(sample - lower_level[i]) * (num_levels[i] - 1) / (upper_level[i] - lower_level[i])`, rounded down to the nearest whole number. To protect against potential overflows, if, for any channel `i`, the product `(upper_level[i] - lower_level[i]) * (num_levels[i] - 1)` exceeds the number representable by a `uint64_t`, the CUDA error `cudaErrorInvalidValue` is returned. If the common type is 128 bits wide, bin computation will use 128-bit arithmetic and `cudaErrorInvalidValue` will only be returned if bin computation would overflow for 128-bit arithmetic.
+For a given row `r` in `[0, num_rows)`, and sample `s` in `[0, num_row_pixels)`, let `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`, `sample_begin = row_begin + s * NUM_CHANNELS`, and `sample_end = sample_begin + NUM_ACTIVE_CHANNELS`. For a given channel `c` in `[0, NUM_ACTIVE_CHANNELS)`, the ranges `[sample_begin, sample_end)` and `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap in any way.
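+`cuda::std::common_type<LevelT, SampleT>` must be valid, and both `LevelT` and `SampleT` must be valid arithmetic types. The common type must be convertible to `int` and trivially copyable.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.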
+
+
+**Template parameters**
+
+
+Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+
+
+
+**[inferred]** Number of channels actively being histogrammed
+
+
+
+**[inferred]** Random-access input iterator type for reading input samples. (may be a simple pointer type)
+
+
+
+**[inferred]** Integer type for histogram bin counters
+
+
+
+**[inferred]** Type for specifying boundaries (levels)
+
+
+
+**[inferred]** Signed integer type for sequence offsets, list lengths, pointer differences, etc. (Consider using 32-bit values as offsets/lengths/etc. For example, `int` will typically yield better performance than `size_t` in 64-bit memory mode.)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four *RGBA* 8-bit samples).
+
+
+
+The pointers to the histogram counter output arrays, one for each active channel. For channel<sub>i</sub>, the allocation length of `d_histogram[i]` should be `num_levels[i] - 1`.
+
+
+
+The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub>i</sub> is `num_levels[i] - 1`.
+
+
+
+The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+
+
+
+The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+
+
+
+The number of multi-channel pixels per row in the region of interest
+
+
+
+The number of rows in the region of interest
+
+
+
+The number of bytes between starts of consecutive rows in the region of interest
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+**Example**
+
+The code snippet below illustrates the computation of three 256-bin *RGB* histograms from a 2x3 region of interest within a flattened 2x4 array of quad-channel *RGBA* pixels (8 bits per channel per pixel).
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for input
+// samples and output histograms
+int num_row_pixels; // e.g., 3
+int num_rows; // e.g., 2
+size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS
+unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -),
+ // (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)]
+int* d_histogram[3]; // e.g., three device pointers to three device buffers,
+ // each allocated with 256 integer counters
+int num_levels[3]; // e.g., {257, 257, 257};
+unsigned int lower_level[3]; // e.g., {0, 0, 0};
+unsigned int upper_level[3]; // e.g., {256, 256, 256};
+...
+
+// Determine temporary device storage requirements
+void* d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceHistogram::MultiHistogramEven<4, 3>(
+ d_temp_storage, temp_storage_bytes,
+ d_samples, d_histogram, num_levels, lower_level, upper_level,
+ num_row_pixels, num_rows, row_stride_bytes);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Compute histograms
+cub::DeviceHistogram::MultiHistogramEven<4, 3>(
+ d_temp_storage, temp_storage_bytes,
+ d_samples, d_histogram, num_levels, lower_level, upper_level,
+ num_row_pixels, num_rows, row_stride_bytes);
+
+// d_histogram <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0],
+// [3, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0],
+// [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ]
+```
+
+
+
+
+**Deprecated** [Since 3.0].
+
+
+```cpp showLineNumbers={false}
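+template <int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>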
+static cudaError_t cub::DeviceHistogram::MultiHistogramEven(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ CounterT *d_histogram[NUM_ACTIVE_CHANNELS],
+ const int num_levels[NUM_ACTIVE_CHANNELS],
+ const LevelT lower_level[NUM_ACTIVE_CHANNELS],
+ const LevelT upper_level[NUM_ACTIVE_CHANNELS],
+ OffsetT num_row_pixels,
+ OffsetT num_rows,
+ size_t row_stride_bytes,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+
+
+### to_array inline static
+
+
+```cpp showLineNumbers={false}
+template
+static auto cub::DeviceHistogram::to_array(
+ T *ptr
+)
+```
+
+
+---
+
+## Custom bin ranges
+
+### HistogramRange inline static
+
+
+
+
+Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
+
+
+```cpp showLineNumbers={false}
+template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
+static cudaError_t cub::DeviceHistogram::HistogramRange(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ CounterT *d_histogram,
+ int num_levels,
+ const LevelT *d_levels,
+ OffsetT num_samples,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The number of histogram bins is `num_levels - 1`.
+The value range for bin<sub>i</sub> is `[d_levels[i], d_levels[i+1])`.
+The range `[d_histogram, d_histogram + num_levels - 1)` shall not overlap `[d_samples, d_samples + num_samples)` nor `[d_levels, d_levels + num_levels)` in any way. The ranges `[d_levels, d_levels + num_levels)` and `[d_samples, d_samples + num_samples)` may overlap.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input samples. (may be a simple pointer type)
+
+
+
+**[inferred]** Integer type for histogram bin counters
+
+
+
+**[inferred]** Type for specifying boundaries (levels)
+
+
+
+**[inferred]** Signed integer type for sequence offsets, list lengths, pointer differences, etc. (Consider using 32-bit values as offsets/lengths/etc. For example, `int` will typically yield better performance than `size_t` in 64-bit memory mode.)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The pointer to the input sequence of data samples.
+
+
+
+The pointer to the histogram counter output array of length `num_levels - 1`.
+
+
+
+The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is `num_levels - 1`.
+
+
+
+The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+
+
+
+The number of input data samples (i.e., the length of `d_samples`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+**Example**
+
+The code snippet below illustrates the computation of a six-bin histogram from a sequence of float samples.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for input
+// samples and output histogram
+int num_samples; // e.g., 10
+float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5]
+int* d_histogram; // e.g., [ -, -, -, -, -, -]
+int num_levels; // e.g., 7 (seven level boundaries for six bins)
+float* d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0]
+...
+
+// Determine temporary device storage requirements
+void* d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceHistogram::HistogramRange(
+ d_temp_storage, temp_storage_bytes,
+ d_samples, d_histogram, num_levels, d_levels, num_samples);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Compute histograms
+cub::DeviceHistogram::HistogramRange(
+ d_temp_storage, temp_storage_bytes,
+ d_samples, d_histogram, num_levels, d_levels, num_samples);
+
+// d_histogram <-- [1, 5, 0, 3, 0, 0];
+```
+
+
+
+
+Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
+
+
+```cpp showLineNumbers={false}
+template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
+static cudaError_t cub::DeviceHistogram::HistogramRange(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ CounterT *d_histogram,
+ int num_levels,
+ const LevelT *d_levels,
+ OffsetT num_row_samples,
+ OffsetT num_rows,
+ size_t row_stride_bytes,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+A two-dimensional *region of interest* within `d_samples` can be specified using the `num_row_samples`, `num_rows`, and `row_stride_bytes` parameters.
+The row stride must be a whole multiple of the sample data type size, i.e., `(row_stride_bytes % sizeof(SampleT)) == 0`.
+The number of histogram bins is `num_levels - 1`.
+The value range for bin<sub>i</sub> is `[d_levels[i], d_levels[i+1])`.
+For a given row `r` in `[0, num_rows)`, let `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)` and `row_end = row_begin + num_row_samples`. The range `[d_histogram, d_histogram + num_levels - 1)` shall not overlap `[row_begin, row_end)` nor `[d_levels, d_levels + num_levels)`. The ranges `[d_levels, d_levels + num_levels)` and `[row_begin, row_end)` may overlap.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input samples. (may be a simple pointer type)
+
+
+
+**[inferred]** Integer type for histogram bin counters
+
+
+
+**[inferred]** Type for specifying boundaries (levels)
+
+
+
+**[inferred]** Signed integer type for sequence offsets, list lengths, pointer differences, etc. (Consider using 32-bit values as offsets/lengths/etc. For example, `int` will typically yield better performance than `size_t` in 64-bit memory mode.)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The pointer to the input sequence of data samples.
+
+
+
+The pointer to the histogram counter output array of length `num_levels - 1`.
+
+
+
+The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is `num_levels - 1`.
+
+
+
+The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+
+
+
+The number of data samples per row in the region of interest
+
+
+
+The number of rows in the region of interest
+
+
+
+The number of bytes between starts of consecutive rows in the region of interest
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+**Example**
+
+The code snippet below illustrates the computation of a six-bin histogram from a 2x5 region of interest within a flattened 2x7 array of float samples.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for input samples and
+// output histogram
+int num_row_samples; // e.g., 5
+int num_rows; // e.g., 2;
+int row_stride_bytes; // e.g., 7 * sizeof(float)
+float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -,
+ // 0.3, 2.9, 2.0, 6.1, 999.5, -, -]
+int* d_histogram; // e.g., [ -, -, -, -, -, -]
+int num_levels; // e.g., 7 (seven level boundaries for six bins)
+float *d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0]
+...
+
+// Determine temporary device storage requirements
+void* d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceHistogram::HistogramRange(
+ d_temp_storage, temp_storage_bytes,
+ d_samples, d_histogram, num_levels, d_levels,
+ num_row_samples, num_rows, row_stride_bytes);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Compute histograms
+cub::DeviceHistogram::HistogramRange(
+ d_temp_storage, temp_storage_bytes,
+ d_samples, d_histogram, num_levels, d_levels,
+ num_row_samples, num_rows, row_stride_bytes);
+
+// d_histogram <-- [1, 5, 0, 3, 0, 0];
+```
+
+
+
+
+### MultiHistogramRange inline static
+
+
+
+
+Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
+
+
+```cpp showLineNumbers={false}
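+template <int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
+static cudaError_t cub::DeviceHistogram::MultiHistogramRange(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ ::cuda::std::array<CounterT*, NUM_ACTIVE_CHANNELS> d_histogram,
+ ::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_levels,
+ ::cuda::std::array<const LevelT*, NUM_ACTIVE_CHANNELS> d_levels,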
+ OffsetT num_pixels,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The input is a sequence of *pixel* structures, where each pixel comprises a record of `NUM_CHANNELS` consecutive data samples (e.g., an *RGBA* pixel).
+`NUM_CHANNELS` can be up to 4.
+Of the `NUM_CHANNELS` specified, the function will only compute histograms for the first `NUM_ACTIVE_CHANNELS` (e.g., *RGB* histograms from *RGBA* pixel samples).
+The number of histogram bins for channel<sub>i</sub> is `num_levels[i] - 1`.
+For channel<sub>i</sub>, the value range for bin<sub>j</sub> is `[d_levels[i][j], d_levels[i][j+1])`.
+For given channels `c1` and `c2` in `[0, NUM_ACTIVE_CHANNELS)`, the range `[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)` shall not overlap `[d_samples, d_samples + NUM_CHANNELS * num_pixels)` nor `[d_levels[c2], d_levels[c2] + num_levels[c2])` in any way. The ranges `[d_levels[c2], d_levels[c2] + num_levels[c2])` and `[d_samples, d_samples + NUM_CHANNELS * num_pixels)` may overlap.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+
+
+
+**[inferred]** Number of channels actively being histogrammed
+
+
+
+**[inferred]** Random-access input iterator type for reading input samples. (may be a simple pointer type)
+
+
+
+**[inferred]** Integer type for histogram bin counters
+
+
+
+**[inferred]** Type for specifying boundaries (levels)
+
+
+
+**[inferred]** Signed integer type for sequence offsets, list lengths, pointer differences, etc. (Consider using 32-bit values as offsets/lengths/etc. For example, `int` will typically yield better performance than `size_t` in 64-bit memory mode.)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four *RGBA* 8-bit samples).
+
+
+
+The pointers to the histogram counter output arrays, one for each active channel. For channel<sub>i</sub>, the allocation length of `d_histogram[i]` should be `num_levels[i] - 1`.
+
+
+
+The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub>i</sub> is `num_levels[i] - 1`.
+
+
+
+The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+
+
+
+The number of multi-channel pixels (i.e., the length of `d_samples` divided by `NUM_CHANNELS`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+**Example**
+
+The code snippet below illustrates the computation of three 4-bin *RGB* histograms from a quad-channel sequence of *RGBA* pixels (8 bits per channel per pixel)
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input samples and output histograms
+int num_pixels; // e.g., 5
+unsigned char *d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2),
+ // (0, 6, 7, 5),(3, 0, 2, 6)]
+unsigned int *d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]];
+int num_levels[3]; // e.g., {5, 5, 5};
+unsigned int *d_levels[3]; // e.g., [ [0, 2, 4, 6, 8],
+ // [0, 2, 4, 6, 8],
+ // [0, 2, 4, 6, 8] ];
+...
+
+// Determine temporary device storage requirements
+void* d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceHistogram::MultiHistogramRange<4, 3>(
+ d_temp_storage, temp_storage_bytes,
+ d_samples, d_histogram, num_levels, d_levels, num_pixels);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Compute histograms
+cub::DeviceHistogram::MultiHistogramRange<4, 3>(
+ d_temp_storage, temp_storage_bytes,
+ d_samples, d_histogram, num_levels, d_levels, num_pixels);
+
+// d_histogram <-- [ [1, 3, 0, 1],
+// [3, 0, 0, 2],
+// [0, 2, 0, 3] ]
+```
+
+
+
+
+**Deprecated** [Since 3.0].
+
+
+```cpp showLineNumbers={false}
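+template <int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>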
+static cudaError_t cub::DeviceHistogram::MultiHistogramRange(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ CounterT *d_histogram[NUM_ACTIVE_CHANNELS],
+ const int num_levels[NUM_ACTIVE_CHANNELS],
+ const LevelT *const d_levels[NUM_ACTIVE_CHANNELS],
+ OffsetT num_pixels,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+
+
+Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
+
+
+```cpp showLineNumbers={false}
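+template <int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
+static cudaError_t cub::DeviceHistogram::MultiHistogramRange(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ ::cuda::std::array<CounterT*, NUM_ACTIVE_CHANNELS> d_histogram,
+ ::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_levels,
+ ::cuda::std::array<const LevelT*, NUM_ACTIVE_CHANNELS> d_levels,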
+ OffsetT num_row_pixels,
+ OffsetT num_rows,
+ size_t row_stride_bytes,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The input is a sequence of *pixel* structures, where each pixel comprises a record of `NUM_CHANNELS` consecutive data samples (e.g., an *RGBA* pixel).
+`NUM_CHANNELS` can be up to 4.
+Of the `NUM_CHANNELS` specified, the function will only compute histograms for the first `NUM_ACTIVE_CHANNELS` (e.g., *RGB* histograms from *RGBA* pixel samples).
+A two-dimensional *region of interest* within `d_samples` can be specified using the `num_row_pixels`, `num_rows`, and `row_stride_bytes` parameters.
+The row stride must be a whole multiple of the sample data type size, i.e., `(row_stride_bytes % sizeof(SampleT)) == 0`.
+The number of histogram bins for channel<sub>i</sub> is `num_levels[i] - 1`.
+For channel<sub>i</sub>, the value range for bin<sub>j</sub> is `[d_levels[i][j], d_levels[i][j+1])`.
+For a given row `r` in `[0, num_rows)`, and sample `s` in `[0, num_row_pixels)`, let `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`, `sample_begin = row_begin + s * NUM_CHANNELS`, and `sample_end = sample_begin + NUM_ACTIVE_CHANNELS`. For given channels `c1` and `c2` in `[0, NUM_ACTIVE_CHANNELS)`, the range `[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)` shall not overlap `[sample_begin, sample_end)` nor `[d_levels[c2], d_levels[c2] + num_levels[c2])` in any way. The ranges `[d_levels[c2], d_levels[c2] + num_levels[c2])` and `[sample_begin, sample_end)` may overlap.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+
+
+
+**[inferred]** Number of channels actively being histogrammed
+
+
+
+**[inferred]** Random-access input iterator type for reading input samples. (may be a simple pointer type)
+
+
+
+**[inferred]** Integer type for histogram bin counters
+
+
+
+**[inferred]** Type for specifying boundaries (levels)
+
+
+
+**[inferred]** Signed integer type for sequence offsets, list lengths, pointer differences, etc. (Consider using 32-bit values as offsets/lengths/etc. For example, `int` will typically yield better performance than `size_t` in 64-bit memory mode.)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four *RGBA* 8-bit samples).
+
+
+
+The pointers to the histogram counter output arrays, one for each active channel. For channel<sub>i</sub>, the allocation length of `d_histogram[i]` should be `num_levels[i] - 1`.
+
+
+
+The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub>i</sub> is `num_levels[i] - 1`.
+
+
+
+The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+
+
+
+The number of multi-channel pixels per row in the region of interest
+
+
+
+The number of rows in the region of interest
+
+
+
+The number of bytes between starts of consecutive rows in the region of interest
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+**Example**
+
+The code snippet below illustrates the computation of three 4-bin *RGB* histograms from a 2x3 region of interest within a flattened 2x4 array of quad-channel *RGBA* pixels (8 bits per channel per pixel).
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for input
+// samples and output histograms
+int num_row_pixels; // e.g., 3
+int num_rows; // e.g., 2
+size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS
+unsigned char* d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -),
+ // (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)]
+int* d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]];
+int num_levels[3]; // e.g., {5, 5, 5};
+unsigned int* d_levels[3]; // e.g., [ [0, 2, 4, 6, 8],
+ // [0, 2, 4, 6, 8],
+ // [0, 2, 4, 6, 8] ];
+...
+
+// Determine temporary device storage requirements
+void* d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceHistogram::MultiHistogramRange<4, 3>(
+ d_temp_storage, temp_storage_bytes,
+ d_samples, d_histogram, num_levels, d_levels,
+ num_row_pixels, num_rows, row_stride_bytes);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Compute histograms
+cub::DeviceHistogram::MultiHistogramRange<4, 3>(
+ d_temp_storage, temp_storage_bytes,
+ d_samples, d_histogram, num_levels,
+ d_levels, num_row_pixels, num_rows, row_stride_bytes);
+
+// d_histogram <-- [ [2, 3, 0, 1],
+// [4, 0, 0, 2],
+// [1, 2, 0, 3] ]
+```
+
+
+
+
+**Deprecated** [Since 3.0].
+
+
+```cpp showLineNumbers={false}
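+template <int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>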
+static cudaError_t cub::DeviceHistogram::MultiHistogramRange(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ CounterT *d_histogram[NUM_ACTIVE_CHANNELS],
+ const int num_levels[NUM_ACTIVE_CHANNELS],
+ const LevelT *const d_levels[NUM_ACTIVE_CHANNELS],
+ OffsetT num_row_pixels,
+ OffsetT num_rows,
+ size_t row_stride_bytes,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+
diff --git a/fern/cudapages/cub/cub/cub/DeviceMemcpy.mdx b/fern/cudapages/cub/cub/cub/DeviceMemcpy.mdx
new file mode 100644
index 0000000..58db367
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceMemcpy.mdx
@@ -0,0 +1,146 @@
+---
+title: cub::DeviceMemcpy
+description: "[cub::DeviceMemcpy](/library/api/cub::_device_memcpy) provides device-wide, parallel operations for copying data."
+---
+
+`cub::DeviceMemcpy` provides device-wide, parallel operations for copying data.
+
+---
+
+## Static methods
+
+### Batched inline static
+
+Copies data from a batch of given source buffers to their corresponding destination buffer.
+
+**Note:**
+
+If any input buffer aliases memory from any output buffer the behavior is undefined. If any output buffer aliases memory of another output buffer the behavior is undefined. Input buffers can alias one another.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceMemcpy::Batched(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputBufferIt input_buffer_it,
+ OutputBufferIt output_buffer_it,
+ BufferSizeIteratorT buffer_sizes,
+ ::cuda::std::int64_t num_buffers,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+**[inferred]** Device-accessible random-access input iterator type providing the pointers to the source memory buffers
+
+
+
+**[inferred]** Device-accessible random-access input iterator type providing the pointers to the destination memory buffers
+
+
+
+**[inferred]** Device-accessible random-access input iterator type providing the number of bytes to be copied for each pair of buffers
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Device-accessible iterator providing the pointers to the source memory buffers
+
+
+
+Device-accessible iterator providing the pointers to the destination memory buffers
+
+
+
+Device-accessible iterator providing the number of bytes to be copied for each pair of buffers
+
+
+
+The total number of buffer pairs
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates usage of DeviceMemcpy::Batched for mutating strings within a single string buffer.
+
+```cpp showLineNumbers={false}
+struct GetPtrToStringItem
+{
+ __host__ __device__ __forceinline__ void *operator()(uint32_t index)
+ {
+ return &d_string_data_in[d_string_offsets[index]];
+ }
+ char *d_string_data_in;
+ uint32_t *d_string_offsets;
+};
+
+struct GetStringItemSize
+{
+ __host__ __device__ __forceinline__ uint32_t operator()(uint32_t index)
+ {
+ return d_string_offsets[index + 1] - d_string_offsets[index];
+ }
+ uint32_t *d_string_offsets;
+};
+
+uint32_t num_strings = 5;
+char *d_string_data_in; // e.g., "TomatoesBananasApplesOrangesGrapes"
+char *d_string_data_out; // e.g., " ... "
+uint32_t *d_string_offsets_old; // e.g., [0, 8, 15, 21, 28, 34]
+uint32_t *d_string_offsets_new; // e.g., [0, 6, 13, 19, 26, 34]
+uint32_t *d_gather_index; // e.g., [2, 1, 4, 3, 0]
+
+// Initialize an iterator that returns d_gather_index[i] when the i-th item is dereferenced
+auto gather_iterator = thrust::make_permutation_iterator(thrust::make_counting_iterator(0),
+d_gather_index);
+
+// Returns pointers to the input buffer for each string
+auto str_ptrs_in = thrust::make_transform_iterator(gather_iterator,
+ GetPtrToStringItem{d_string_data_in,
+d_string_offsets_old});
+
+// Returns the string size of the i-th string
+auto str_sizes = thrust::make_transform_iterator(gather_iterator,
+GetStringItemSize{d_string_offsets_old});
+
+// Returns pointers to the output buffer for each string
+auto str_ptrs_out = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
+ GetPtrToStringItem{d_string_data_out,
+d_string_offsets_new});
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, str_ptrs_in, str_ptrs_out,
+str_sizes, num_strings);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run batched copy algorithm (used to permute strings)
+cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, str_ptrs_in, str_ptrs_out,
+str_sizes, num_strings);
+
+// d_string_data_out       <-- "ApplesBananasGrapesOrangesTomatoes"
+```
diff --git a/fern/cudapages/cub/cub/cub/DeviceMerge.mdx b/fern/cudapages/cub/cub/cub/DeviceMerge.mdx
new file mode 100644
index 0000000..496401d
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceMerge.mdx
@@ -0,0 +1,202 @@
+---
+title: cub::DeviceMerge
+description: "[DeviceMerge](/library/api/cub::_device_merge) provides device-wide, parallel operations for merging two sorted sequences of values (called keys) or key-value pairs in device-accessible memory."
+---
+
+`DeviceMerge` provides device-wide, parallel operations for merging two sorted sequences of values (called keys) or key-value pairs in device-accessible memory.
+
+The sorting order is determined by a comparison functor (default: less-than), which has to establish a [strict weak ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order).
+
+---
+
+## Static methods
+
+### MergeKeys inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceMerge::MergeKeys(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyIteratorIn1 keys_in1,
+ ::cuda::std::int64_t num_keys1,
+ KeyIteratorIn2 keys_in2,
+ ::cuda::std::int64_t num_keys2,
+ KeyIteratorOut keys_out,
+ CompareOp compare_op = {},
+ cudaStream_t stream = nullptr
+)
+```
+
+
+*Added in v2.7.0. First appears in CUDA Toolkit 12.8.*
+
+**Template parameters**
+
+
+**[deduced]** Random access iterator to the first sorted input sequence. Must have the same value type as KeyIteratorIn2.
+
+
+
+**[deduced]** Random access iterator to the second sorted input sequence. Must have the same value type as KeyIteratorIn1.
+
+
+
+**[deduced]** Random access iterator to the output sequence.
+
+
+
+**[deduced]** Binary predicate to compare the input iterator's value types. Must have a signature equivalent to `bool operator()(Key lhs, Key rhs)` and establish a [strict weak ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order).
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation.
+
+
+
+Iterator to the beginning of the first sorted input sequence.
+
+
+
+Number of keys in the first input sequence.
+
+
+
+Iterator to the beginning of the second sorted input sequence.
+
+
+
+Number of keys in the second input sequence.
+
+
+
+Iterator to the beginning of the output sequence.
+
+
+
+Comparison function object, returning true if the first argument is ordered before the second. Must establish a [strict weak ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order).
+
+
+
+**[optional]** CUDA stream to launch kernels into. Default is stream<sub>0</sub>.
+
+
+### MergePairs inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceMerge::MergePairs(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyIteratorIn1 keys_in1,
+ ValueIteratorIn1 values_in1,
+ ::cuda::std::int64_t num_pairs1,
+ KeyIteratorIn2 keys_in2,
+ ValueIteratorIn2 values_in2,
+ ::cuda::std::int64_t num_pairs2,
+ KeyIteratorOut keys_out,
+ ValueIteratorOut values_out,
+ CompareOp compare_op = {},
+ cudaStream_t stream = nullptr
+)
+```
+
+
+*Added in v2.7.0. First appears in CUDA Toolkit 12.8.*
+
+**Template parameters**
+
+
+**[deduced]** Random access iterator to the keys of the first sorted input sequence. Must have the same value type as KeyIteratorIn2.
+
+
+
+**[deduced]** Random access iterator to the values of the first sorted input sequence. Must have the same value type as ValueIteratorIn2.
+
+
+
+**[deduced]** Random access iterator to the keys of the second sorted input sequence. Must have the same value type as KeyIteratorIn1.
+
+
+
+**[deduced]** Random access iterator to the values of the second sorted input sequence. Must have the same value type as ValueIteratorIn1.
+
+
+
+**[deduced]** Random access iterator to the keys of the output sequence.
+
+
+
+**[deduced]** Random access iterator to the values of the output sequence.
+
+
+
+**[deduced]** Binary predicate to compare the key input iterator's value types. Must have a signature equivalent to `bool operator()(Key lhs, Key rhs)` and establish a [strict weak ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order).
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation.
+
+
+
+Iterator to the beginning of the keys of the first sorted input sequence.
+
+
+
+Iterator to the beginning of the values of the first sorted input sequence.
+
+
+
+Number of key-value pairs in the first input sequence.
+
+
+
+Iterator to the beginning of the keys of the second sorted input sequence.
+
+
+
+Iterator to the beginning of the values of the second sorted input sequence.
+
+
+
+Number of key-value pairs in the second input sequence.
+
+
+
+Iterator to the beginning of the keys of the output sequence.
+
+
+
+Iterator to the beginning of the values of the output sequence.
+
+
+
+Comparison function object, returning true if the first argument is ordered before the second. Must establish a [strict weak ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order).
+
+
+
+**[optional]** CUDA stream to launch kernels into. Default is stream<sub>0</sub>.
+
diff --git a/fern/cudapages/cub/cub/cub/DeviceMergeSort.mdx b/fern/cudapages/cub/cub/cub/DeviceMergeSort.mdx
new file mode 100644
index 0000000..af35af2
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceMergeSort.mdx
@@ -0,0 +1,633 @@
+---
+title: cub::DeviceMergeSort
+description: "[DeviceMergeSort](/library/api/cub::_device_merge_sort) provides device-wide, parallel operations for computing a merge sort across a sequence of data items residing within device-accessible memory."
+---
+
+`DeviceMergeSort` provides device-wide, parallel operations for computing a merge sort across a sequence of data items residing within device-accessible memory.
+
+**Overview**
+
+- `DeviceMergeSort` arranges items into ascending order using a comparison functor with less-than semantics. Merge sort can handle arbitrary types (as long as a value of these types is a model of [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable)) and comparison functors, but is slower than [DeviceRadixSort](/library/api/cub::_device_radix_sort) when sorting arithmetic types into ascending/descending order.
+- Another difference from RadixSort is the fact that `DeviceMergeSort` can handle arbitrary random-access iterators, as shown below.
+
+**A Simple Example**
+
+
+The code snippet below illustrates usage with a reverse iterator (`cuda::std::reverse_iterator`).
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_merge_sort.cuh>
+
+struct CustomLess
+{
+ template <typename DataType>
+ __device__ bool operator()(const DataType &lhs, const DataType &rhs)
+ {
+ return lhs < rhs;
+ }
+};
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+thrust::device_vector<KeyType> d_keys(num_items);
+thrust::device_vector<DataType> d_values(num_items);
+// ...
+
+// Initialize iterator
+using KeyIterator = typename thrust::device_vector<KeyType>::iterator;
+cuda::std::reverse_iterator<KeyIterator> reverse_iter(d_keys.end());
+
+// Determine temporary device storage requirements
+size_t temp_storage_bytes = 0;
+cub::DeviceMergeSort::SortPairs(
+ nullptr,
+ temp_storage_bytes,
+ reverse_iter,
+ thrust::raw_pointer_cast(d_values.data()),
+ num_items,
+ CustomLess());
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceMergeSort::SortPairs(
+ d_temp_storage,
+ temp_storage_bytes,
+ reverse_iter,
+ thrust::raw_pointer_cast(d_values.data()),
+ num_items,
+ CustomLess());
+```
+
+---
+
+## Methods
+
+### GetName inline static constexpr
+
+
+```cpp showLineNumbers={false}
+static constexpr const char * cub::DeviceMergeSort::GetName()
+```
+
+
+### SortPairsNoNVTX inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceMergeSort::SortPairsNoNVTX(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyIteratorT d_keys,
+ ValueIteratorT d_items,
+ OffsetT num_items,
+ CompareOpT compare_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+### SortKeysNoNVTX inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceMergeSort::SortKeysNoNVTX(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyIteratorT d_keys,
+ OffsetT num_items,
+ CompareOpT compare_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+### SortKeysCopyNoNVTX inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceMergeSort::SortKeysCopyNoNVTX(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyInputIteratorT d_input_keys,
+ KeyIteratorT d_output_keys,
+ OffsetT num_items,
+ CompareOpT compare_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+---
+
+## Static methods
+
+### SortPairs inline static
+
+Sorts items using a merge sorting method.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceMergeSort::SortPairs(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyIteratorT d_keys,
+ ValueIteratorT d_items,
+ OffsetT num_items,
+ CompareOpT compare_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator). `KeyIteratorT` is mutable, and its `value_type` is a model of [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable). This `value_type`'s ordering relation is a *strict weak ordering* as defined in the [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable) requirements.
+
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator), and `ValueIteratorT` is mutable.
+
+
+
+Is an integer type for global offsets.
+
+
+
+Is a type of callable object with the signature `bool operator()(KeyT lhs, KeyT rhs)` that models the [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order) concept.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of unsorted input keys
+
+
+
+Pointer to the input sequence of unsorted input values
+
+
+
+Number of items to sort
+
+
+
+Comparison function object which returns true if the first argument is ordered before the second
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+### SortPairsCopy inline static
+
+Sorts items using a merge sorting method.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceMergeSort::SortPairsCopy(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyInputIteratorT d_input_keys,
+ ValueInputIteratorT d_input_items,
+ KeyIteratorT d_output_keys,
+ ValueIteratorT d_output_items,
+ OffsetT num_items,
+ CompareOpT compare_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator). Its `value_type` is a model of [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable). This `value_type`'s ordering relation is a *strict weak ordering* as defined in the [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable) requirements.
+
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator).
+
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator). `KeyIteratorT` is mutable, and its `value_type` is a model of [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable). This `value_type`'s ordering relation is a *strict weak ordering* as defined in the [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable) requirements.
+
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator), and `ValueIteratorT` is mutable.
+
+
+
+Is an integer type for global offsets.
+
+
+
+Is a type of callable object with the signature `bool operator()(KeyT lhs, KeyT rhs)` that models the [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order) concept.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of unsorted input keys
+
+
+
+Pointer to the input sequence of unsorted input values
+
+
+
+Pointer to the output sequence of sorted input keys
+
+
+
+Pointer to the output sequence of sorted input values
+
+
+
+Number of items to sort
+
+
+
+Comparison function object which returns `true` if the first argument is ordered before the second
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+### SortKeys inline static
+
+Sorts items using a merge sorting method.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceMergeSort::SortKeys(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyIteratorT d_keys,
+ OffsetT num_items,
+ CompareOpT compare_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator). `KeyIteratorT` is mutable, and its `value_type` is a model of [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable). This `value_type`'s ordering relation is a *strict weak ordering* as defined in the [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable) requirements.
+
+
+
+Is an integer type for global offsets.
+
+
+
+Is a type of callable object with the signature `bool operator()(KeyT lhs, KeyT rhs)` that models the [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order) concept.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of unsorted input keys
+
+
+
+Number of items to sort
+
+
+
+Comparison function object which returns true if the first argument is ordered before the second
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+### SortKeysCopy inline static
+
+Sorts items using a merge sorting method.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceMergeSort::SortKeysCopy(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyInputIteratorT d_input_keys,
+ KeyIteratorT d_output_keys,
+ OffsetT num_items,
+ CompareOpT compare_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator). Its `value_type` is a model of [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable). This `value_type`'s ordering relation is a *strict weak ordering* as defined in the [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable) requirements.
+
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator). `KeyIteratorT` is mutable, and its `value_type` is a model of [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable). This `value_type`'s ordering relation is a *strict weak ordering* as defined in the [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable) requirements.
+
+
+
+Is an integer type for global offsets.
+
+
+
+Is a type of callable object with the signature `bool operator()(KeyT lhs, KeyT rhs)` that models the [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order) concept.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of unsorted input keys
+
+
+
+Pointer to the output sequence of sorted input keys
+
+
+
+Number of items to sort
+
+
+
+Comparison function object which returns true if the first argument is ordered before the second
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+### StableSortPairs inline static
+
+Sorts items using a merge sorting method.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceMergeSort::StableSortPairs(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyIteratorT d_keys,
+ ValueIteratorT d_items,
+ OffsetT num_items,
+ CompareOpT compare_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator). `KeyIteratorT` is mutable, and its `value_type` is a model of [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable). This `value_type`'s ordering relation is a *strict weak ordering* as defined in the [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable) requirements.
+
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator), and `ValueIteratorT` is mutable.
+
+
+
+Is an integer type for global offsets.
+
+
+
+Is a type of callable object with the signature `bool operator()(KeyT lhs, KeyT rhs)` that models the [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order) concept.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of unsorted input keys
+
+
+
+Pointer to the input sequence of unsorted input values
+
+
+
+Number of items to sort
+
+
+
+Comparison function object which returns true if the first argument is ordered before the second
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+### StableSortKeys inline static
+
+Sorts items using a merge sorting method.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceMergeSort::StableSortKeys(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyIteratorT d_keys,
+ OffsetT num_items,
+ CompareOpT compare_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator). `KeyIteratorT` is mutable, and its `value_type` is a model of [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable). This `value_type`'s ordering relation is a *strict weak ordering* as defined in the [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable) requirements.
+
+
+
+Is an integer type for global offsets.
+
+
+
+Is a type of callable object with the signature `bool operator()(KeyT lhs, KeyT rhs)` that models the [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order) concept.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of unsorted input keys
+
+
+
+Number of items to sort
+
+
+
+Comparison function object which returns true if the first argument is ordered before the second
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+### StableSortKeysCopy inline static
+
+Sorts items using a merge sorting method.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceMergeSort::StableSortKeysCopy(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyInputIteratorT d_input_keys,
+ KeyIteratorT d_output_keys,
+ OffsetT num_items,
+ CompareOpT compare_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator). Its `value_type` is a model of [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable). This `value_type`'s ordering relation is a *strict weak ordering* as defined in the [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable) requirements.
+
+
+
+Is a model of [Random Access Iterator](https://en.cppreference.com/w/cpp/iterator/random_access_iterator). `KeyIteratorT` is mutable, and its `value_type` is a model of [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable). This `value_type`'s ordering relation is a *strict weak ordering* as defined in the [LessThan Comparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable) requirements.
+
+
+
+Is an integer type for global offsets.
+
+
+
+Is a type of callable object with the signature `bool operator()(KeyT lhs, KeyT rhs)` that models the [Strict Weak Ordering](https://en.cppreference.com/w/cpp/concepts/strict_weak_order) concept.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of unsorted input keys
+
+
+
+Pointer to the output sequence of sorted input keys
+
+
+
+Number of elements in `d_input_keys` to sort
+
+
+
+Comparison function object which returns true if the first argument is ordered before the second
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
diff --git a/fern/cudapages/cub/cub/cub/DevicePartition.mdx b/fern/cudapages/cub/cub/cub/DevicePartition.mdx
new file mode 100644
index 0000000..5f4c5f3
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DevicePartition.mdx
@@ -0,0 +1,730 @@
+---
+title: cub::DevicePartition
+description: ""
+---
+
+DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory.
+
+## Performance considerations
+
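+The work-complexity of partition as a function of input size is linear, resulting in performance throughput that plateaus with problem sizes large enough to saturate the GPU.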
+
+---
+
+## Methods
+
+### partition_impl inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DevicePartition::partition_impl(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ FlagIteratorT d_flags,
+ OutputIteratorT d_out,
+ NumSelectedIteratorT d_num_selected_out,
+ OffsetT num_items,
+ SelectOpT select_op,
+ cudaStream_t stream
+)
+```
+
+
+### IfNoNVTX inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DevicePartition::IfNoNVTX(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ FirstOutputIteratorT d_first_part_out,
+ SecondOutputIteratorT d_second_part_out,
+ UnselectedOutputIteratorT d_unselected_out,
+ NumSelectedIteratorT d_num_selected_out,
+ NumItemsT num_items,
+ SelectFirstPartOp select_first_part_op,
+ SelectSecondPartOp select_second_part_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+---
+
+## Static methods
+
+### Flagged inline static
+
+
+
+
+Uses the `d_flags` sequence to split the corresponding items from `d_in` into a partitioned sequence `d_out`. The total number of items copied into the first partition is written to `d_num_selected_out`.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DevicePartition::Flagged(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ FlagIterator d_flags,
+ OutputIteratorT d_out,
+ NumSelectedIteratorT d_num_selected_out,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The value type of `d_flags` must be castable to `bool` (e.g., `bool`, `char`, `int`, etc.).
+Copies of the selected items are compacted into `d_out` and maintain their original relative ordering, however copies of the unselected items are compacted into the rear of `d_out` in reverse order.
+The range `[d_out, d_out + num_items)` shall not overlap `[d_in, d_in + num_items)` nor `[d_flags, d_flags + num_items)` in any way. The range `[d_in, d_in + num_items)` may overlap `[d_flags, d_flags + num_items)`.
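+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.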
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading selection flags (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing output items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of items selected (may be a simple pointer type)
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the input sequence of selection flags
+
+
+
+Pointer to the output sequence of partitioned data items
+
+
+
+Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
+
+
+
+Total number of items to select from
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the compaction of items selected from an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_partition.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input, flags, and output
+int num_items; // e.g., 8
+int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
+int *d_out; // e.g., [ , , , , , , , ]
+int *d_num_selected_out; // e.g., [ ]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DevicePartition::Flagged(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_flags, d_out, d_num_selected_out, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run selection
+cub::DevicePartition::Flagged(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_flags, d_out, d_num_selected_out, num_items);
+
+// d_out <-- [1, 4, 6, 7, 8, 5, 3, 2]
+// d_num_selected_out <-- [4]
+```
+
+
+
+
+nodiscard
+
+Uses the `d_flags` sequence to split the corresponding items from `d_in` into a partitioned sequence `d_out`. The total number of items copied into the first partition is written to `d_num_selected_out`.
+
+This is an environment-based API that allows customization of:
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DevicePartition::Flagged(
+ InputIteratorT d_in,
+ FlagIterator d_flags,
+ OutputIteratorT d_out,
+ NumSelectedIteratorT d_num_selected_out,
+ NumItemsT num_items,
+ EnvT env = {}
+)
+```
+
+
+
+Stream: Query via `cuda::get_stream`
+Memory resource: Query via `cuda::mr::get_memory_resource`
+The value type of `d_flags` must be castable to `bool` (e.g., `bool`, `char`, `int`, etc.).
+Copies of the selected items are compacted into `d_out` and maintain their original relative ordering, however copies of the unselected items are compacted into the rear of `d_out` in reverse order.
+The range `[d_out, d_out + num_items)` shall not overlap `[d_in, d_in + num_items)` nor `[d_flags, d_flags + num_items)` in any way. The range `[d_in, d_in + num_items)` may overlap `[d_flags, d_flags + num_items)`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading selection flags (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing output items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of items selected (may be a simple pointer type)
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
+
+
+**Parameters**
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the input sequence of selection flags
+
+
+
+Pointer to the output sequence of partitioned data items
+
+
+
+Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
+
+
+
+Total number of items to select from
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
+
+
+### If inline static
+
+
+
+
+Uses the `select_op` functor to split the corresponding items from `d_in` into a partitioned sequence `d_out`. The total number of items copied into the first partition is written to `d_num_selected_out`.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DevicePartition::If(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumSelectedIteratorT d_num_selected_out,
+ NumItemsT num_items,
+ SelectOp select_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Copies of the selected items are compacted into `d_out` and maintain their original relative ordering, however copies of the unselected items are compacted into the rear of `d_out` in reverse order.
+The range `[d_out, d_out + num_items)` shall not overlap `[d_in, d_in + num_items)` in any way.
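+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.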
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing output items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of items selected (may be a simple pointer type)
+
+
+
+**[inferred]** Selection functor type having member `bool operator()(const T &a)`
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output sequence of partitioned data items
+
+
+
+Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
+
+
+
+Total number of items to select from
+
+
+
+Unary selection operator
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the compaction of items selected from an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_partition.cuh>
+
+// Functor type for selecting values less than some criteria
+struct LessThan
+{
+ int compare;
+
+ CUB_RUNTIME_FUNCTION __forceinline__
+ explicit LessThan(int compare) : compare(compare) {}
+
+ CUB_RUNTIME_FUNCTION __forceinline__
+ bool operator()(const int &a) const
+ {
+ return (a < compare);
+ }
+};
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_items; // e.g., 8
+int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
+int *d_out; // e.g., [ , , , , , , , ]
+int *d_num_selected_out; // e.g., [ ]
+LessThan select_op(7);
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DevicePartition::If(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, d_num_selected_out, num_items, select_op);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run selection
+cub::DevicePartition::If(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, d_num_selected_out, num_items, select_op);
+
+// d_out <-- [0, 2, 3, 5, 2, 8, 81, 9]
+// d_num_selected_out <-- [5]
+```
+
+
+
+
+nodiscard
+
+Uses the `select_op` functor to split the corresponding items from `d_in` into a partitioned sequence `d_out`. The total number of items copied into the first partition is written to `d_num_selected_out`.
+
+This is an environment-based API that allows customization of:
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DevicePartition::If(
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumSelectedIteratorT d_num_selected_out,
+ NumItemsT num_items,
+ SelectOp select_op,
+ EnvT env = {}
+)
+```
+
+
+
+Stream: Query via `cuda::get_stream`
+Memory resource: Query via `cuda::mr::get_memory_resource`
+Copies of the selected items are compacted into `d_out` and maintain their original relative ordering, however copies of the unselected items are compacted into the rear of `d_out` in reverse order.
+The range `[d_out, d_out + num_items)` shall not overlap `[d_in, d_in + num_items)` in any way.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing output items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of items selected (may be a simple pointer type)
+
+
+
+**[inferred]** Selection functor type having member `bool operator()(const T &a)`
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
+
+
+**Parameters**
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output sequence of partitioned data items
+
+
+
+Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
+
+
+
+Total number of items to select from
+
+
+
+Unary selection operator
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
+
+
+Uses two functors to split the corresponding items from `d_in` into three partitioned sequences `d_first_part_out`, `d_second_part_out`, and `d_unselected_out`. The total number of items copied into the first partition is written to `d_num_selected_out[0]`, while the total number of items copied into the second partition is written to `d_num_selected_out[1]`.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DevicePartition::If(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ FirstOutputIteratorT d_first_part_out,
+ SecondOutputIteratorT d_second_part_out,
+ UnselectedOutputIteratorT d_unselected_out,
+ NumSelectedIteratorT d_num_selected_out,
+ NumItemsT num_items,
+ SelectFirstPartOp select_first_part_op,
+ SelectSecondPartOp select_second_part_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Copies of the items selected by `select_first_part_op` are compacted into `d_first_part_out` and maintain their original relative ordering.
+Copies of the items selected by `select_second_part_op` are compacted into `d_second_part_out` and maintain their original relative ordering.
+Copies of the unselected items are compacted into the `d_unselected_out` in reverse order.
+The ranges `[d_in, d_in + num_items)`, `[d_first_part_out, d_first_part_out + d_num_selected_out[0])`, `[d_second_part_out, d_second_part_out + d_num_selected_out[1])`, `[d_unselected_out, d_unselected_out + num_items - d_num_selected_out[0] - d_num_selected_out[1])`, shall not overlap in any way.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing output items selected by first operator (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing output items selected by second operator (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing unselected items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of items selected (may be a simple pointer type)
+
+
+
+**[inferred]** Selection functor type having member `bool operator()(const T &a)`
+
+
+
+**[inferred]** Selection functor type having member `bool operator()(const T &a)`
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output sequence of data items selected by `select_first_part_op`
+
+
+
+Pointer to the output sequence of data items selected by `select_second_part_op`
+
+
+
+Pointer to the output sequence of unselected data items
+
+
+
+Pointer to the output array with two elements, where total number of items selected by `select_first_part_op` is stored as `d_num_selected_out[0]` and total number of items selected by `select_second_part_op` is stored as `d_num_selected_out[1]`, respectively
+
+
+
+Total number of items to select from
+
+
+
+Unary selection operator to select `d_first_part_out`
+
+
+
+Unary selection operator to select `d_second_part_out`
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates how this algorithm can partition an input vector into small, medium, and large items so that the relative order of items remain deterministic.
+
+Let's consider any value that doesn't exceed six a small one. On the other hand, any value that exceeds 50 will be considered a large one. Since the value used to define a small part doesn't match one that defines the large part, the intermediate segment is implied.
+
+These definitions partition a value space into three categories. We want to preserve the order of items in which they appear in the input vector. Since the algorithm provides stable partitioning, this is possible.
+
+Since the number of items in each category is unknown beforehand, we need three output arrays of num_items elements each. To reduce the memory requirements, we can combine the output storage for two categories.
+
+Since each value falls precisely in one category, it's safe to add "large" values into the head of the shared output vector and the "middle" values into its tail. To add items into the tail of the output array, we can use `cuda::std::reverse_iterator`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_partition.cuh>
+
+// Functor type for selecting values less than some criteria
+struct LessThan
+{
+ int compare;
+
+ __host__ __device__ __forceinline__
+ explicit LessThan(int compare) : compare(compare) {}
+
+ __host__ __device__ __forceinline__
+ bool operator()(const int &a) const
+ {
+ return a < compare;
+ }
+};
+
+// Functor type for selecting values greater than some criteria
+struct GreaterThan
+{
+ int compare;
+
+ __host__ __device__ __forceinline__
+ explicit GreaterThan(int compare) : compare(compare) {}
+
+ __host__ __device__ __forceinline__
+ bool operator()(const int &a) const
+ {
+ return a > compare;
+ }
+};
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_items; // e.g., 8
+int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
+int *d_large_and_unselected_out; // e.g., [ , , , , , , , ]
+int *d_small_out; // e.g., [ , , , , , , , ]
+int *d_num_selected_out; // e.g., [ , ]
+cuda::std::reverse_iterator<int*> unselected_out(d_large_and_unselected_out + num_items);
+LessThan small_items_selector(7);
+GreaterThan large_items_selector(50);
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DevicePartition::If(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_large_and_unselected_out, d_small_out, unselected_out,
+ d_num_selected_out, num_items,
+ large_items_selector, small_items_selector);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run selection
+cub::DevicePartition::If(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_large_and_unselected_out, d_small_out, unselected_out,
+ d_num_selected_out, num_items,
+ large_items_selector, small_items_selector);
+
+// d_large_and_unselected_out <-- [ 81, , , , , , 8, 9 ]
+// d_small_out <-- [ 0, 2, 3, 5, 2, , , ]
+// d_num_selected_out <-- [ 1, 5 ]
+```
+
+
+
diff --git a/fern/cudapages/cub/cub/cub/DeviceRadixSort.mdx b/fern/cudapages/cub/cub/cub/DeviceRadixSort.mdx
new file mode 100644
index 0000000..8ae872a
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceRadixSort.mdx
@@ -0,0 +1,2874 @@
+---
+title: cub::DeviceRadixSort
+description: ""
+---
+
+DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory.
+
+
+
+The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges items into ascending (or descending) order. The algorithm relies upon a positional representation for keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, characters, etc.) specified from least-significant to most-significant. For a given input sequence of keys and a set of rules specifying a total ordering of the symbolic alphabet, the radix sorting method produces a lexicographic ordering of those keys.
+
+Assumes threads are in row-major order.
+
+
+DeviceRadixSort can sort all of the built-in C++ numeric primitive types (`unsigned char`, `int`, `double`, etc.) as well as CUDA's `__half` and `__nv_bfloat16` 16-bit floating-point types. User-defined types are supported as long as a decomposer object is provided.
+
+
+- Positive and negative zeros are considered equivalent, and will be treated
+ as such in the output.
+- No special handling is implemented for NaN values; these are sorted
+ according to their bit representations after any transformations.
+
+
+Although the direct radix sorting method can only be applied to unsigned integral types, DeviceRadixSort is able to sort signed and floating-point types via simple bit-wise transformations that ensure lexicographic key ordering. Additional transformations occur for descending sorts. These transformations must be considered when restricting the `[begin_bit, end_bit)` range, as the bitwise transformations will occur before the bit-range truncation.
+
+Any transformations applied to the keys prior to sorting are reversed while writing to the final output buffer.
+
+
+To convert the input values into a radix-sortable bitwise representation, the following transformations take place prior to sorting:
+
+- For unsigned integral values, the keys are used directly.
+- For signed integral values, the sign bit is inverted.
+- For positive floating point values, the sign bit is inverted.
+- For negative floating point values, the full key is inverted.
+
+For floating point types, positive and negative zero are a special case and will be considered equivalent during sorting.
+
+
+If descending sort is used, the keys are inverted after performing any type-specific transformations, and the resulting keys are sorted in ascending order.
+
+
+DeviceRadixSort is stable. For floating-point types, `-0.0` and `+0.0` are considered equal and appear in the result in the same order as they appear in the input.
+
+
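+**Dynamic parallelism.** DeviceRadixSort methods can be called within kernel code on devices in which CUDA dynamic parallelism is supported.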
+
+
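+The work-complexity of radix sort as a function of input size is linear, resulting in performance throughput that plateaus with problem sizes large enough to saturate the GPU.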
+
+---
+
+## KeyT-value pairs
+
+### SortPairs inline static
+
+
+
+
+Sorts key-value pairs into ascending order using $\approx 2N$ auxiliary storage.
+
+The code snippet below illustrates the sorting of a device vector of `int` keys with associated vector of `int` values.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceRadixSort::SortPairs(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ NumItemsT num_items,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The contents of the input data are not altered by the sorting operation.
+Pointers to contiguous memory must be used; iterators are not currently supported.
+In-place operations are not supported. There must be no overlap between any of the provided ranges:
+`[d_keys_in, d_keys_in + num_items)`
+`[d_keys_out, d_keys_out + num_items)`
+`[d_values_in, d_values_in + num_items)`
+`[d_values_out, d_values_out + num_items)`
+An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
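+This operation requires an allocation of temporary device storage that is `O(N+P)`, where `N` is the length of the input and `P` is the number of streaming multiprocessors on the device. For sorting using only `O(P)` temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.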
+
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** ValueT type
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input data of key data to sort
+
+
+
+Pointer to the sorted output sequence of key data
+
+
+
+Pointer to the corresponding input sequence of associated value items
+
+
+
+Pointer to the correspondingly-reordered output sequence of associated value items
+
+
+
+Number of items to sort
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+**Example**
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_radix_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_keys_out; // e.g., [ ... ]
+int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
+int *d_values_out; // e.g., [ ... ]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+
+// d_keys_out <-- [0, 3, 5, 6, 7, 8, 9]
+// d_values_out <-- [5, 4, 3, 1, 2, 0, 6]
+```
+
+
+
+
+nodiscard
+
+Sorts key-value pairs into ascending order using $\approx 2N$ auxiliary storage.
+
+This is an environment-based API that allows customization of:
+
+The code snippet below illustrates the env-based sorting of key-value pairs:
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_env_api.cu :language: c++ :dedent: :start-after: example-begin radix-sort-pairs-env :end-before: example-end radix-sort-pairs-env
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceRadixSort::SortPairs(
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ NumItemsT num_items,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8,
+ EnvT env = {}
+)
+```
+
+
+*Added in v3.4.0. First appears in CUDA Toolkit 13.4.*
+
+
+Stream: Query via `cuda::get_stream`
+Memory resource: Query via `cuda::mr::get_memory_resource`
+The contents of the input data are not altered by the sorting operation.
+Pointers to contiguous memory must be used; iterators are not currently supported.
+In-place operations are not supported. There must be no overlap between any of the provided ranges:
+`[d_keys_in, d_keys_in + num_items)`
+`[d_keys_out, d_keys_out + num_items)`
+`[d_values_in, d_values_in + num_items)`
+`[d_values_out, d_values_out + num_items)`
+An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** ValueT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
+
+
+**Parameters**
+
+
+Pointer to the input data of key data to sort
+
+
+
+Pointer to the sorted output sequence of key data
+
+
+
+Pointer to the corresponding input sequence of associated value items
+
+
+
+Pointer to the correspondingly-reordered output sequence of associated value items
+
+
+
+Number of items to sort
+
+
+
+The least-significant bit index (inclusive) needed for key comparison
+
+
+
+The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
+
+
+Sorts key-value pairs into ascending order using $\approx 2N$ auxiliary storage.
+
+The contents of the input data are not altered by the sorting operation. Pointers to contiguous memory must be used; iterators are not currently supported. In-place operations are not supported. There must be no overlap between any of the provided ranges:
+
+`[d_keys_in, d_keys_in + num_items)`, `[d_keys_out, d_keys_out + num_items)`, `[d_values_in, d_values_in + num_items)`, and `[d_values_out, d_values_out + num_items)`.
+
+A bit subrange `[begin_bit, end_bit)` is provided to specify differentiating key bits. This can reduce overall sorting overhead and yield a corresponding performance improvement. This operation requires an allocation of temporary device storage that is $O(N+P)$, where $N$ is the length of the input and $P$ is the number of streaming multiprocessors on the device. For sorting using only $O(P)$ temporary storage, see the sorting interface using DoubleBuffer wrappers below. When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The following snippet shows how to sort an array of `custom_t` objects using `cub::DeviceRadixSort::SortPairs`:
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin pairs-bits :end-before: example-end pairs-bits
+
+
+```cpp showLineNumbers={false}
+template
+static ::cuda::std::enable_if_t, cudaError_t> cub::DeviceRadixSort::SortPairs(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ NumItemsT num_items,
+ DecomposerT decomposer,
+ int begin_bit,
+ int end_bit,
+ cudaStream_t stream = 0
+)
+```
+
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** ValueT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input data of key data to sort
+
+
+
+Pointer to the sorted output sequence of key data
+
+
+
+Pointer to the corresponding input sequence of associated value items
+
+
+
+Pointer to the correspondingly-reordered output sequence of associated value items
+
+
+
+Number of items to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+Sorts key-value pairs into ascending order using $\approx 2N$ auxiliary storage.
+
+The contents of the input data are not altered by the sorting operation. Pointers to contiguous memory must be used; iterators are not currently supported. In-place operations are not supported. There must be no overlap between any of the provided ranges:
+
+`[d_keys_in, d_keys_in + num_items)`, `[d_keys_out, d_keys_out + num_items)`, `[d_values_in, d_values_in + num_items)`, and `[d_values_out, d_values_out + num_items)`.
+
+This operation requires an allocation of temporary device storage that is $O(N+P)$, where $N$ is the length of the input and $P$ is the number of streaming multiprocessors on the device. For sorting using only $O(P)$ temporary storage, see the sorting interface using DoubleBuffer wrappers below. When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+See the snippet between the `example-begin custom-type` and `example-end custom-type` markers in `cub/test/catch2_test_device_radix_sort_custom.cu`.
+
+The following snippet shows how to sort an array of `custom_t` objects using `cub::DeviceRadixSort::SortPairs`:
+
+See the snippet between the `example-begin pairs` and `example-end pairs` markers in `cub/test/catch2_test_device_radix_sort_custom.cu`.
+
+
+```cpp showLineNumbers={false}
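+template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
+static ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>, cudaError_t> cub::DeviceRadixSort::SortPairs(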
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ NumItemsT num_items,
+ DecomposerT decomposer,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** ValueT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of key data to sort
+
+
+
+Pointer to the sorted output sequence of key data
+
+
+
+Pointer to the corresponding input sequence of associated value items
+
+
+
+Pointer to the correspondingly-reordered output sequence of associated value items
+
+
+
+Number of items to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+
+Sorts key-value pairs into ascending order using approximately `N` auxiliary storage.
+
+The code snippet below illustrates the sorting of a device vector of `int` keys with associated vector of `int` values.
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename ValueT, typename NumItemsT>
+static cudaError_t cub::DeviceRadixSort::SortPairs(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,
+ NumItemsT num_items,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) * 8,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* The sorting operation is given a pair of key buffers and a corresponding pair of associated value buffers. Each pair is managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+* The contents of both buffers within each pair may be altered by the sorting operation.
+* In-place operations are not supported. There must be no overlap between any of the provided ranges:
+  * `[d_keys.Current(), d_keys.Current() + num_items)`
+  * `[d_keys.Alternate(), d_keys.Alternate() + num_items)`
+  * `[d_values.Current(), d_values.Current() + num_items)`
+  * `[d_values.Alternate(), d_values.Alternate() + num_items)`
+* Upon completion, the sorting operation will update the "current" indicator within each DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+* An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+* This operation requires a relatively small allocation of temporary device storage that is `O(P)`, where `P` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size `N`).
+* When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** ValueT type
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+
+
+
+Number of items to sort
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+**Example**
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_radix_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for
+// sorting data
+int num_items; // e.g., 7
+int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_key_alt_buf; // e.g., [ ... ]
+int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
+int *d_value_alt_buf; // e.g., [ ... ]
+...
+
+// Create a set of DoubleBuffers to wrap pairs of device pointers
+cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceRadixSort::SortPairs(
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceRadixSort::SortPairs(
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+
+// d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9]
+// d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6]
+```
+
+
+
+
+nodiscard
+
+Sorts key-value pairs into ascending order using approximately `N` auxiliary storage.
+
+This is an environment-based API that allows customization of the stream and the memory resource used by the algorithm.
+
+
+```cpp showLineNumbers={false}
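+template <typename KeyT, typename ValueT, typename NumItemsT, typename EnvT = ::cuda::std::execution::env<>>
+static cudaError_t cub::DeviceRadixSort::SortPairs(
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,
+ NumItemsT num_items,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) * 8,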
+ EnvT env = {}
+)
+```
+
+
+*Added in v3.4.0. First appears in CUDA Toolkit 13.4.*
+
+
+* Stream: Query via `cuda::get_stream`
+* Memory resource: Query via `cuda::mr::get_memory_resource`
+* The sorting operation is given a pair of key buffers and a corresponding pair of associated value buffers. Each pair is managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+* The contents of both buffers within each pair may be altered by the sorting operation.
+* In-place operations are not supported. There must be no overlap between any of the provided ranges:
+  * `[d_keys.Current(), d_keys.Current() + num_items)`
+  * `[d_keys.Alternate(), d_keys.Alternate() + num_items)`
+  * `[d_values.Current(), d_values.Current() + num_items)`
+  * `[d_values.Alternate(), d_values.Alternate() + num_items)`
+* Upon completion, the sorting operation will update the "current" indicator within each DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+* An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** ValueT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
+
+
+**Parameters**
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+
+
+
+Number of items to sort
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
+
+
+Sorts key-value pairs into ascending order using approximately `N` auxiliary storage.
+
+* The sorting operation is given a pair of key buffers and a corresponding pair of associated value buffers. Each pair is managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+* The contents of both buffers within each pair may be altered by the sorting operation.
+* In-place operations are not supported. There must be no overlap between any of the provided ranges:
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+See the snippet between the `example-begin custom-type` and `example-end custom-type` markers in `cub/test/catch2_test_device_radix_sort_custom.cu`.
+
+The following snippet shows how to sort an array of `custom_t` objects using `cub::DeviceRadixSort::SortPairs`:
+
+See the snippet between the `example-begin pairs-db` and `example-end pairs-db` markers in `cub/test/catch2_test_device_radix_sort_custom.cu`.
+
+
+```cpp showLineNumbers={false}
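+template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
+static ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>, cudaError_t> cub::DeviceRadixSort::SortPairs(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,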
+ NumItemsT num_items,
+ DecomposerT decomposer,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
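+* `[d_keys.Current(), d_keys.Current() + num_items)`
+* `[d_keys.Alternate(), d_keys.Alternate() + num_items)`
+* `[d_values.Current(), d_values.Current() + num_items)`
+* `[d_values.Alternate(), d_values.Alternate() + num_items)`
+* Upon completion, the sorting operation will update the "current" indicator within each DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+* This operation requires a relatively small allocation of temporary device storage that is `O(P)`, where `P` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size `N`).
+* When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.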
+
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** ValueT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+
+
+
+Number of items to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+
+Sorts key-value pairs into ascending order using approximately `N` auxiliary storage.
+
+* The sorting operation is given a pair of key buffers and a corresponding pair of associated value buffers. Each pair is managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+* The contents of both buffers within each pair may be altered by the sorting operation.
+* In-place operations are not supported. There must be no overlap between any of the provided ranges:
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+See the snippet between the `example-begin custom-type` and `example-end custom-type` markers in `cub/test/catch2_test_device_radix_sort_custom.cu`.
+
+The following snippet shows how to sort an array of `custom_t` objects using `cub::DeviceRadixSort::SortPairs`:
+
+See the snippet between the `example-begin pairs-bits-db` and `example-end pairs-bits-db` markers in `cub/test/catch2_test_device_radix_sort_custom.cu`.
+
+
+```cpp showLineNumbers={false}
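+template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
+static ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>, cudaError_t> cub::DeviceRadixSort::SortPairs(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,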
+ NumItemsT num_items,
+ DecomposerT decomposer,
+ int begin_bit,
+ int end_bit,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* `[d_keys.Current(), d_keys.Current() + num_items)`
+* `[d_keys.Alternate(), d_keys.Alternate() + num_items)`
+* `[d_values.Current(), d_values.Current() + num_items)`
+* `[d_values.Alternate(), d_values.Alternate() + num_items)`
+* Upon completion, the sorting operation will update the "current" indicator within each DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+* An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+* This operation requires a relatively small allocation of temporary device storage that is `O(P)`, where `P` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size `N`).
+* When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** ValueT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+
+
+
+Number of items to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+The least-significant bit index (inclusive) needed for key comparison
+
+
+
+The most-significant bit index (exclusive) needed for key comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+
+### SortPairsDescending inline static
+
+
+
+
+Sorts key-value pairs into descending order using approximately `2N` auxiliary storage.
+
+The code snippet below illustrates the sorting of a device vector of `int` keys with associated vector of `int` values.
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename ValueT, typename NumItemsT>
+static cudaError_t cub::DeviceRadixSort::SortPairsDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ NumItemsT num_items,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) * 8,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* The contents of the input data are not altered by the sorting operation.
+* Pointers to contiguous memory must be used; iterators are not currently supported.
+* In-place operations are not supported. There must be no overlap between any of the provided ranges:
+  * `[d_keys_in, d_keys_in + num_items)`
+  * `[d_keys_out, d_keys_out + num_items)`
+  * `[d_values_in, d_values_in + num_items)`
+  * `[d_values_out, d_values_out + num_items)`
+* An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+* This operation requires an allocation of temporary device storage that is `O(N+P)`, where `N` is the length of the input and `P` is the number of streaming multiprocessors on the device. For sorting using only `O(P)` temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+* When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** ValueT type
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of key data to sort
+
+
+
+Pointer to the sorted output sequence of key data
+
+
+
+Pointer to the corresponding input sequence of associated value items
+
+
+
+Pointer to the correspondingly-reordered output sequence of associated value items
+
+
+
+Number of items to sort
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+**Example**
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_radix_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_keys_out; // e.g., [ ... ]
+int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
+int *d_values_out; // e.g., [ ... ]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceRadixSort::SortPairsDescending(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceRadixSort::SortPairsDescending(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+
+// d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]
+// d_values_out <-- [6, 0, 2, 1, 3, 4, 5]
+```
+
+
+
+
+nodiscard
+
+Sorts key-value pairs into descending order using approximately `2N` auxiliary storage.
+
+This is an environment-based API that allows customization of the stream and the memory resource used by the algorithm.
+
+The code snippet below illustrates the env-based descending sort of key-value pairs:
+
+See the snippet between the `example-begin radix-sort-pairs-descending-env` and `example-end radix-sort-pairs-descending-env` markers in `cub/test/catch2_test_device_radix_sort_env_api.cu`.
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename ValueT, typename NumItemsT, typename EnvT = ::cuda::std::execution::env<>>
+static cudaError_t cub::DeviceRadixSort::SortPairsDescending(
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ NumItemsT num_items,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) * 8,
+ EnvT env = {}
+)
+```
+
+
+*Added in v3.4.0. First appears in CUDA Toolkit 13.4.*
+
+
+* Stream: Query via `cuda::get_stream`
+* Memory resource: Query via `cuda::mr::get_memory_resource`
+* The contents of the input data are not altered by the sorting operation.
+* Pointers to contiguous memory must be used; iterators are not currently supported.
+* In-place operations are not supported. There must be no overlap between any of the provided ranges:
+  * `[d_keys_in, d_keys_in + num_items)`
+  * `[d_keys_out, d_keys_out + num_items)`
+  * `[d_values_in, d_values_in + num_items)`
+  * `[d_values_out, d_values_out + num_items)`
+* An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** ValueT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
+
+
+**Parameters**
+
+
+Pointer to the input sequence of key data to sort
+
+
+
+Pointer to the sorted output sequence of key data
+
+
+
+Pointer to the corresponding input sequence of associated value items
+
+
+
+Pointer to the correspondingly-reordered output sequence of associated value items
+
+
+
+Number of items to sort
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
+
+
+Sorts key-value pairs into descending order using approximately `2N` auxiliary storage.
+
+* The contents of the input data are not altered by the sorting operation.
+* Pointers to contiguous memory must be used; iterators are not currently supported.
+* In-place operations are not supported. There must be no overlap between any of the provided ranges:
+  * `[d_keys_in, d_keys_in + num_items)`
+  * `[d_keys_out, d_keys_out + num_items)`
+  * `[d_values_in, d_values_in + num_items)`
+  * `[d_values_out, d_values_out + num_items)`
+* A bit subrange `[begin_bit, end_bit)` is provided to specify differentiating key bits. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+* This operation requires an allocation of temporary device storage that is `O(N+P)`, where `N` is the length of the input and `P` is the number of streaming multiprocessors on the device. For sorting using only `O(P)` temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+* When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+See the snippet between the `example-begin custom-type` and `example-end custom-type` markers in `cub/test/catch2_test_device_radix_sort_custom.cu`.
+
+The following snippet shows how to sort an array of `custom_t` objects using `cub::DeviceRadixSort::SortPairsDescending`:
+
+See the snippet between the `example-begin pairs-descending-bits` and `example-end pairs-descending-bits` markers in `cub/test/catch2_test_device_radix_sort_custom.cu`.
+
+
+```cpp showLineNumbers={false}
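+template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
+static ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>, cudaError_t> cub::DeviceRadixSort::SortPairsDescending(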
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ NumItemsT num_items,
+ DecomposerT decomposer,
+ int begin_bit,
+ int end_bit,
+ cudaStream_t stream = 0
+)
+```
+
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** ValueT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of key data to sort
+
+
+
+Pointer to the sorted output sequence of key data
+
+
+
+Pointer to the corresponding input sequence of associated value items
+
+
+
+Pointer to the correspondingly-reordered output sequence of associated value items
+
+
+
+Number of items to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+The least-significant bit index (inclusive) needed for key comparison
+
+
+
+The most-significant bit index (exclusive) needed for key comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+
+Sorts key-value pairs into descending order using approximately `2N` auxiliary storage.
+
+* The contents of the input data are not altered by the sorting operation.
+* Pointers to contiguous memory must be used; iterators are not currently supported.
+* In-place operations are not supported. There must be no overlap between any of the provided ranges:
+  * `[d_keys_in, d_keys_in + num_items)`
+  * `[d_keys_out, d_keys_out + num_items)`
+  * `[d_values_in, d_values_in + num_items)`
+  * `[d_values_out, d_values_out + num_items)`
+* This operation requires an allocation of temporary device storage that is `O(N+P)`, where `N` is the length of the input and `P` is the number of streaming multiprocessors on the device. For sorting using only `O(P)` temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+* When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+See the snippet between the `example-begin custom-type` and `example-end custom-type` markers in `cub/test/catch2_test_device_radix_sort_custom.cu`.
+
+The following snippet shows how to sort an array of `custom_t` objects using `cub::DeviceRadixSort::SortPairsDescending`:
+
+See the snippet between the `example-begin pairs-descending` and `example-end pairs-descending` markers in `cub/test/catch2_test_device_radix_sort_custom.cu`.
+
+
+```cpp showLineNumbers={false}
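+template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
+static ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>, cudaError_t> cub::DeviceRadixSort::SortPairsDescending(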
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ NumItemsT num_items,
+ DecomposerT decomposer,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** ValueT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of key data to sort
+
+
+
+Pointer to the sorted output sequence of key data
+
+
+
+Pointer to the corresponding input sequence of associated value items
+
+
+
+Pointer to the correspondingly-reordered output sequence of associated value items
+
+
+
+Number of items to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+
+Sorts key-value pairs into descending order using approximately `N` auxiliary storage.
+
+The code snippet below illustrates the sorting of a device vector of `int` keys with associated vector of `int` values.
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename ValueT, typename NumItemsT>
+static cudaError_t cub::DeviceRadixSort::SortPairsDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,
+ NumItemsT num_items,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) * 8,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* The sorting operation is given a pair of key buffers and a corresponding pair of associated value buffers. Each pair is managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+* The contents of both buffers within each pair may be altered by the sorting operation.
+* In-place operations are not supported. There must be no overlap between any of the provided ranges:
+  * `[d_keys.Current(), d_keys.Current() + num_items)`
+  * `[d_keys.Alternate(), d_keys.Alternate() + num_items)`
+  * `[d_values.Current(), d_values.Current() + num_items)`
+  * `[d_values.Alternate(), d_values.Alternate() + num_items)`
+* Upon completion, the sorting operation will update the "current" indicator within each DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+* An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+* This operation requires a relatively small allocation of temporary device storage that is `O(P)`, where `P` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size `N`).
+* When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** ValueT type
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+
+
+
+Number of items to sort
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+**Example**
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_radix_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_key_alt_buf; // e.g., [ ... ]
+int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
+int *d_value_alt_buf; // e.g., [ ... ]
+...
+
+// Create a set of DoubleBuffers to wrap pairs of device pointers
+cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceRadixSort::SortPairsDescending(
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceRadixSort::SortPairsDescending(
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+
+// d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0]
+// d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5]
+```
+
+
+
+
+nodiscard
+
+Sorts key-value pairs into descending order using $\approx N$ auxiliary storage.
+
+This is an environment-based API that allows customization of:
+
+
+```cpp showLineNumbers={false}
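+template <typename KeyT, typename ValueT, typename NumItemsT, typename EnvT = ::cuda::std::execution::env<>>
+static cudaError_t cub::DeviceRadixSort::SortPairsDescending(
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,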
+ NumItemsT num_items,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8,
+ EnvT env = {}
+)
+```
+
+
+*Added in v3.4.0. First appears in CUDA Toolkit 13.4.*
+
+
+Stream: Query via `cuda::get_stream`
+Memory resource: Query via `cuda::mr::get_memory_resource`
+The sorting operation is given a pair of key buffers and a corresponding pair of associated value buffers. Each pair is managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+The contents of both buffers within each pair may be altered by the sorting operation.
+In-place operations are not supported. There must be no overlap between any of the provided ranges:
+`[d_keys.Current(), d_keys.Current() + num_items)`
+`[d_keys.Alternate(), d_keys.Alternate() + num_items)`
+`[d_values.Current(), d_values.Current() + num_items)`
+`[d_values.Alternate(), d_values.Alternate() + num_items)`
+Upon completion, the sorting operation will update the "current" indicator within each DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** ValueT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
+
+
+**Parameters**
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+
+
+
+Number of items to sort
+
+
+
+The least-significant bit index (inclusive) needed for key comparison
+
+
+
+The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
+
+
+Sorts key-value pairs into descending order using $\approx N$ auxiliary storage.
+
+* The sorting operation is given a pair of key buffers and a corresponding pair of associated value buffers. Each pair is managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+* The contents of both buffers within each pair may be altered by the sorting operation.
+* In-place operations are not supported. There must be no overlap between any of the provided ranges:
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The following snippet shows how to sort an array of `custom_t` objects using `cub::DeviceRadixSort::SortPairsDescending`:
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin pairs-descending-db :end-before: example-end pairs-descending-db
+
+
+```cpp showLineNumbers={false}
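+template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
+static ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>, cudaError_t> cub::DeviceRadixSort::SortPairsDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,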
+ NumItemsT num_items,
+ DecomposerT decomposer,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+`[d_keys.Current(), d_keys.Current() + num_items)`
+`[d_keys.Alternate(), d_keys.Alternate() + num_items)`
+`[d_values.Current(), d_values.Current() + num_items)`
+`[d_values.Alternate(), d_values.Alternate() + num_items)`
+Upon completion, the sorting operation will update the "current" indicator within each DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
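+This operation requires a relatively small allocation of temporary device storage that is `O(P)`, where `P` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size `N`).
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.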
+
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** ValueT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+
+
+
+Number of items to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream0.
+
+
+
+
+
+Sorts key-value pairs into descending order using $\approx N$ auxiliary storage.
+
+* The sorting operation is given a pair of key buffers and a corresponding pair of associated value buffers. Each pair is managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+* The contents of both buffers within each pair may be altered by the sorting operation.
+* In-place operations are not supported. There must be no overlap between any of the provided ranges:
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The following snippet shows how to sort an array of `custom_t` objects using `cub::DeviceRadixSort::SortPairsDescending`:
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin pairs-descending-bits-db :end-before: example-end pairs-descending-bits-db
+
+
+```cpp showLineNumbers={false}
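+template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
+static ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>, cudaError_t> cub::DeviceRadixSort::SortPairsDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,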
+ NumItemsT num_items,
+ DecomposerT decomposer,
+ int begin_bit,
+ int end_bit,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+`[d_keys.Current(), d_keys.Current() + num_items)`
+`[d_keys.Alternate(), d_keys.Alternate() + num_items)`
+`[d_values.Current(), d_values.Current() + num_items)`
+`[d_values.Alternate(), d_values.Alternate() + num_items)`
+Upon completion, the sorting operation will update the "current" indicator within each DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
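+This operation requires a relatively small allocation of temporary device storage that is `O(P)`, where `P` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size `N`).
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.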
+
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** ValueT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+
+
+
+Number of items to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream0.
+
+
+
+
+
+---
+
+## Keys-only
+
+### SortKeys inline static
+
+
+
+
+Sorts keys into ascending order using $\approx 2N$ auxiliary storage.
+
+The code snippet below illustrates the sorting of a device vector of `int` keys.
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename NumItemsT>
+static cudaError_t cub::DeviceRadixSort::SortKeys(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ NumItemsT num_items,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The contents of the input data are not altered by the sorting operation.
+Pointers to contiguous memory must be used; iterators are not currently supported.
+In-place operations are not supported. There must be no overlap between any of the provided ranges:
+`[d_keys_in, d_keys_in + num_items)`
+`[d_keys_out, d_keys_out + num_items)`
+An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
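+This operation requires an allocation of temporary device storage that is `O(N+P)`, where `N` is the length of the input and `P` is the number of streaming multiprocessors on the device. For sorting using only `O(P)` temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.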
+
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input data of key data to sort
+
+
+
+Pointer to the sorted output sequence of key data
+
+
+
+Number of items to sort
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream0.
+
+
+**Example**
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_radix_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_keys_out; // e.g., [ ... ]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceRadixSort::SortKeys(
+ d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceRadixSort::SortKeys(
+ d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+
+// d_keys_out <-- [0, 3, 5, 6, 7, 8, 9]
+```
+
+
+
+
+Sorts keys into ascending order using $\approx 2N$ auxiliary storage.
+
+* The contents of the input data are not altered by the sorting operation.
+* Pointers to contiguous memory must be used; iterators are not currently supported.
+* In-place operations are not supported. There must be no overlap between any of the provided ranges:
+
+  * `[d_keys_in, d_keys_in + num_items)`
+  * `[d_keys_out, d_keys_out + num_items)`
+
+* A bit subrange `[begin_bit, end_bit)` is provided to specify differentiating key bits. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+* This operation requires an allocation of temporary device storage that is `O(N+P)`, where `N` is the length of the input and `P` is the number of streaming multiprocessors on the device. For sorting using only `O(P)` temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+* When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The following snippet shows how to sort an array of `custom_t` objects using `cub::DeviceRadixSort::SortKeys`:
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin keys-bits :end-before: example-end keys-bits
+
+
+```cpp showLineNumbers={false}
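+template <typename KeyT, typename NumItemsT, typename DecomposerT>
+static ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>, cudaError_t> cub::DeviceRadixSort::SortKeys(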
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ NumItemsT num_items,
+ DecomposerT decomposer,
+ int begin_bit,
+ int end_bit,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input data of key data to sort
+
+
+
+Pointer to the sorted output sequence of key data
+
+
+
+Number of items to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream0.
+
+
+
+
+
+Sorts keys into ascending order using $\approx 2N$ auxiliary storage.
+
+* The contents of the input data are not altered by the sorting operation.
+* Pointers to contiguous memory must be used; iterators are not currently supported.
+* In-place operations are not supported. There must be no overlap between any of the provided ranges:
+
+  * `[d_keys_in, d_keys_in + num_items)`
+  * `[d_keys_out, d_keys_out + num_items)`
+
+* An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+* This operation requires an allocation of temporary device storage that is `O(N+P)`, where `N` is the length of the input and `P` is the number of streaming multiprocessors on the device. For sorting using only `O(P)` temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+* When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The following snippet shows how to sort an array of `custom_t` objects using `cub::DeviceRadixSort::SortKeys`:
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin keys :end-before: example-end keys
+
+
+```cpp showLineNumbers={false}
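+template <typename KeyT, typename NumItemsT, typename DecomposerT>
+static ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>, cudaError_t> cub::DeviceRadixSort::SortKeys(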
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ NumItemsT num_items,
+ DecomposerT decomposer,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input data of key data to sort
+
+
+
+Pointer to the sorted output sequence of key data
+
+
+
+Number of items to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream0.
+
+
+
+
+
+Sorts keys into ascending order using $\approx N$ auxiliary storage.
+
+The code snippet below illustrates the sorting of a device vector of `int` keys.
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename NumItemsT>
+static cudaError_t cub::DeviceRadixSort::SortKeys(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ NumItemsT num_items,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The sorting operation is given a pair of key buffers managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+The contents of both buffers may be altered by the sorting operation.
+In-place operations are not supported. There must be no overlap between any of the provided ranges:
+`[d_keys.Current(), d_keys.Current() + num_items)`
+`[d_keys.Alternate(), d_keys.Alternate() + num_items)`
+Upon completion, the sorting operation will update the "current" indicator within the DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
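+This operation requires a relatively small allocation of temporary device storage that is `O(P)`, where `P` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size `N`).
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.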
+
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Number of items to sort
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream0.
+
+
+**Example**
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_radix_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_key_alt_buf; // e.g., [ ... ]
+...
+
+// Create a DoubleBuffer to wrap the pair of device pointers
+cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceRadixSort::SortKeys(
+ d_temp_storage, temp_storage_bytes, d_keys, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceRadixSort::SortKeys(
+ d_temp_storage, temp_storage_bytes, d_keys, num_items);
+
+// d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9]
+```
+
+
+
+
+Sorts keys into ascending order using $\approx N$ auxiliary storage.
+
+* The sorting operation is given a pair of key buffers managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+* The contents of both buffers may be altered by the sorting operation.
+* In-place operations are not supported. There must be no overlap between any of the provided ranges:
+
+  * `[d_keys.Current(), d_keys.Current() + num_items)`
+  * `[d_keys.Alternate(), d_keys.Alternate() + num_items)`
+
+* Upon completion, the sorting operation will update the "current" indicator within the DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+* This operation requires a relatively small allocation of temporary device storage that is `O(P)`, where `P` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size `N`).
+* When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The following snippet shows how to sort an array of `custom_t` objects using `cub::DeviceRadixSort::SortKeys`:
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin keys-db :end-before: example-end keys-db
+
+
+```cpp showLineNumbers={false}
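+template <typename KeyT, typename NumItemsT, typename DecomposerT>
+static ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>, cudaError_t> cub::DeviceRadixSort::SortKeys(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,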
+ NumItemsT num_items,
+ DecomposerT decomposer,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Number of items to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream0.
+
+
+
+
+
+Sorts keys into ascending order using $\approx N$ auxiliary storage.
+
+* The sorting operation is given a pair of key buffers managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+* The contents of both buffers may be altered by the sorting operation.
+* In-place operations are not supported. There must be no overlap between any of the provided ranges:
+
+  * `[d_keys.Current(), d_keys.Current() + num_items)`
+  * `[d_keys.Alternate(), d_keys.Alternate() + num_items)`
+
+* A bit subrange `[begin_bit, end_bit)` is provided to specify differentiating key bits. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+* Upon completion, the sorting operation will update the "current" indicator within the DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+* This operation requires a relatively small allocation of temporary device storage that is `O(P)`, where `P` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size `N`).
+* When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The following snippet shows how to sort an array of `custom_t` objects using `cub::DeviceRadixSort::SortKeys`:
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin keys-bits-db :end-before: example-end keys-bits-db
+
+
+```cpp showLineNumbers={false}
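+template <typename KeyT, typename NumItemsT, typename DecomposerT>
+static ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>, cudaError_t> cub::DeviceRadixSort::SortKeys(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,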
+ NumItemsT num_items,
+ DecomposerT decomposer,
+ int begin_bit,
+ int end_bit,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Number of items to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream0.
+
+
+
+
+
+### SortKeysDescending inline static
+
+
+
+
+Sorts keys into descending order using $\approx 2N$ auxiliary storage.
+
+The code snippet below illustrates the sorting of a device vector of `int` keys.
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename NumItemsT>
+static cudaError_t cub::DeviceRadixSort::SortKeysDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ NumItemsT num_items,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The contents of the input data are not altered by the sorting operation.
+Pointers to contiguous memory must be used; iterators are not currently supported.
+In-place operations are not supported. There must be no overlap between any of the provided ranges:
+`[d_keys_in, d_keys_in + num_items)`
+`[d_keys_out, d_keys_out + num_items)`
+An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
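+This operation requires an allocation of temporary device storage that is `O(N+P)`, where `N` is the length of the input and `P` is the number of streaming multiprocessors on the device. For sorting using only `O(P)` temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.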
+
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input data of key data to sort
+
+
+
+Pointer to the sorted output sequence of key data
+
+
+
+Number of items to sort
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream0.
+
+
+**Example**
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_radix_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_keys_out; // e.g., [ ... ]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceRadixSort::SortKeysDescending(
+ d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceRadixSort::SortKeysDescending(
+ d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+
+// d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]
+```
+
+
+
+
+Sorts keys into descending order using $\approx 2N$ auxiliary storage.
+
+- The contents of the input data are not altered by the sorting operation.
+- Pointers to contiguous memory must be used; iterators are not currently supported.
+- In-place operations are not supported. There must be no overlap between any of the provided ranges:
+  - `[d_keys_in, d_keys_in + num_items)`
+  - `[d_keys_out, d_keys_out + num_items)`
+- An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+- This operation requires an allocation of temporary device storage that is `O(N+P)`, where `N` is the length of the input and `P` is the number of streaming multiprocessors on the device. For sorting using only `O(P)` temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+- When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The following snippet shows how to sort an array of `custom_t` objects using `cub::DeviceRadixSort::SortKeysDescending`:
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin keys-descending-bits :end-before: example-end keys-descending-bits
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename NumItemsT, typename DecomposerT>
+static ::cuda::std::enable_if_t, cudaError_t> cub::DeviceRadixSort::SortKeysDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ NumItemsT num_items,
+ DecomposerT decomposer,
+ int begin_bit,
+ int end_bit,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input data of key data to sort
+
+
+
+Pointer to the sorted output sequence of key data
+
+
+
+Number of items to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream0.
+
+
+
+
+
+Sorts keys into descending order using $\approx 2N$ auxiliary storage.
+
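+- The contents of the input data are not altered by the sorting operation.
+- Pointers to contiguous memory must be used; iterators are not currently supported.
+- In-place operations are not supported. There must be no overlap between any of the provided ranges:
+  - `[d_keys_in, d_keys_in + num_items)`
+  - `[d_keys_out, d_keys_out + num_items)`
+- This operation requires an allocation of temporary device storage that is `O(N+P)`, where `N` is the length of the input and `P` is the number of streaming multiprocessors on the device. For sorting using only `O(P)` temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+- When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.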
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The following snippet shows how to sort an array of `custom_t` objects using `cub::DeviceRadixSort::SortKeysDescending`:
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin keys-descending :end-before: example-end keys-descending
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename NumItemsT, typename DecomposerT>
+static ::cuda::std::enable_if_t, cudaError_t> cub::DeviceRadixSort::SortKeysDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ NumItemsT num_items,
+ DecomposerT decomposer,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input data of key data to sort
+
+
+
+Pointer to the sorted output sequence of key data
+
+
+
+Number of items to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream0.
+
+
+
+
+
+Sorts keys into descending order using $\approx N$ auxiliary storage.
+
+The code snippet below illustrates the sorting of a device vector of `int` keys.
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename NumItemsT>
+static cudaError_t cub::DeviceRadixSort::SortKeysDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ NumItemsT num_items,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+- The sorting operation is given a pair of key buffers managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+- The contents of both buffers may be altered by the sorting operation.
+- In-place operations are not supported. There must be no overlap between any of the provided ranges:
+  - `[d_keys.Current(), d_keys.Current() + num_items)`
+  - `[d_keys.Alternate(), d_keys.Alternate() + num_items)`
+- Upon completion, the sorting operation will update the "current" indicator within the DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+- An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+- This operation requires a relatively small allocation of temporary device storage that is `O(P)`, where `P` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size `N`).
+- When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Number of items to sort
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream0.
+
+
+**Example**
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_radix_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_key_alt_buf; // e.g., [ ... ]
+...
+
+// Create a DoubleBuffer to wrap the pair of device pointers
+cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceRadixSort::SortKeysDescending(
+ d_temp_storage, temp_storage_bytes, d_keys, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceRadixSort::SortKeysDescending(
+ d_temp_storage, temp_storage_bytes, d_keys, num_items);
+
+// d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0]
+```
+
+
+
+
+Sorts keys into descending order using $\approx N$ auxiliary storage.
+
+- The sorting operation is given a pair of key buffers managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+- The contents of both buffers may be altered by the sorting operation.
+- In-place operations are not supported. There must be no overlap between any of the provided ranges:
+  - `[d_keys.Current(), d_keys.Current() + num_items)`
+  - `[d_keys.Alternate(), d_keys.Alternate() + num_items)`
+- Upon completion, the sorting operation will update the "current" indicator within the DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+- This operation requires a relatively small allocation of temporary device storage that is `O(P)`, where `P` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size `N`).
+- When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The following snippet shows how to sort an array of `custom_t` objects using `cub::DeviceRadixSort::SortKeysDescending`:
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin keys-descending-db :end-before: example-end keys-descending-db
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename NumItemsT, typename DecomposerT>
+static ::cuda::std::enable_if_t, cudaError_t> cub::DeviceRadixSort::SortKeysDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ NumItemsT num_items,
+ DecomposerT decomposer,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Number of items to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream0.
+
+
+
+
+
+Sorts keys into descending order using $\approx N$ auxiliary storage.
+
+- The sorting operation is given a pair of key buffers managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+- The contents of both buffers may be altered by the sorting operation.
+- In-place operations are not supported. There must be no overlap between any of the provided ranges:
+  - `[d_keys.Current(), d_keys.Current() + num_items)`
+  - `[d_keys.Alternate(), d_keys.Alternate() + num_items)`
+- A bit subrange `[begin_bit, end_bit)` is provided to specify differentiating key bits. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+- Upon completion, the sorting operation will update the "current" indicator within the DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+- This operation requires a relatively small allocation of temporary device storage that is `O(P)`, where `P` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size `N`).
+- When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+Let's consider a user-defined `custom_t` type below. To sort an array of `custom_t` objects, we have to tell CUB about relevant members of the `custom_t` type. We do this by providing a decomposer that returns a tuple of references to relevant members of the key.
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin custom-type :end-before: example-end custom-type
+
+The following snippet shows how to sort an array of `custom_t` objects using `cub::DeviceRadixSort::SortKeysDescending`:
+
+.. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu :language: c++ :dedent: :start-after: example-begin keys-descending-bits-db :end-before: example-end keys-descending-bits-db
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename NumItemsT, typename DecomposerT>
+static ::cuda::std::enable_if_t, cudaError_t> cub::DeviceRadixSort::SortKeysDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ NumItemsT num_items,
+ DecomposerT decomposer,
+ int begin_bit,
+ int end_bit,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+**[inferred]** KeyT type
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Type of a callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types: `::cuda::std::tuple operator()(KeyT &key)`. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Number of items to sort
+
+
+
+Callable object responsible for decomposing a `KeyT` into a tuple of references to its constituent arithmetic types. The leftmost element of the tuple is considered the most significant. The call operator must not modify members of the key.
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream0.
+
+
+
+
+
+---
+
+## Utility methods
+
+### custom_radix_sort inline static
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceRadixSort::custom_radix_sort(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ bool is_overwrite_okay,
+ DoubleBuffer &d_keys,
+ DoubleBuffer &d_values,
+ OffsetT num_items,
+ DecomposerT decomposer,
+ int begin_bit,
+ int end_bit,
+ cudaStream_t stream
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceRadixSort::custom_radix_sort(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ bool is_overwrite_okay,
+ DoubleBuffer &d_keys,
+ DoubleBuffer &d_values,
+ OffsetT num_items,
+ DecomposerT decomposer,
+ cudaStream_t stream
+)
+```
+
+
+
+
+
+### GetName inline static constexpr
+
+
+```cpp showLineNumbers={false}
+static constexpr const char * cub::DeviceRadixSort::GetName()
+```
+
diff --git a/fern/cudapages/cub/cub/cub/DeviceReduce.mdx b/fern/cudapages/cub/cub/cub/DeviceReduce.mdx
new file mode 100644
index 0000000..edc92fc
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceReduce.mdx
@@ -0,0 +1,1750 @@
+---
+title: cub::DeviceReduce
+description: ""
+---
+
+DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.
+
+
+
+A [reduction](http://en.wikipedia.org/wiki/Reduce_(higher-order_function)) (or *fold*) uses a binary combining operator to compute a single aggregate from a sequence of input elements.
+
+
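+**Dynamic parallelism.** DeviceReduce methods can be called within kernel code on devices in which CUDA dynamic parallelism is supported.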
+
+
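+The work-complexity of reduction, reduce-by-key, and run-length encode as a function of input size is linear, resulting in performance throughput that plateaus with problem sizes large enough to saturate the GPU.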
+
+---
+
+## Methods
+
+### reduce_impl inline static
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceReduce::reduce_impl(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumItemsT num_items,
+ ReductionOpT reduction_op,
+ TransformOpT transform_op,
+ T init,
+ ::cuda::execution::determinism::__determinism_holder_t,
+ cudaStream_t stream
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceReduce::reduce_impl(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumItemsT num_items,
+ ReductionOpT,
+ TransformOpT transform_op,
+ T init,
+ ::cuda::execution::determinism::gpu_to_gpu_t,
+ cudaStream_t stream
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceReduce::reduce_impl(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumItemsT num_items,
+ ReductionOpT reduction_op,
+ TransformOpT transform_op,
+ T init,
+ ::cuda::execution::determinism::not_guaranteed_t,
+ cudaStream_t stream
+)
+```
+
+
+
+
+
+---
+
+## Static methods
+
+### Reduce inline static
+
+
+
+
+Computes a device-wide reduction using the specified binary `reduction_op` functor and initial value `init`.
+
+
+```cpp showLineNumbers={false}
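+template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT, typename T, typename NumItemsT>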
+static cudaError_t cub::DeviceReduce::Reduce(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumItemsT num_items,
+ ReductionOpT reduction_op,
+ T init,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+- Does not support binary reduction operators that are non-commutative.
+- Provides "run-to-run" determinism for pseudo-associative reduction (e.g., addition of floating point types) on the same GPU device. However, results for pseudo-associative reduction may be inconsistent from one device to another device of a different compute-capability because CUB can employ different tile-sizing for different architectures.
+- The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
+- When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+
+**[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+Binary reduction functor
+
+
+
+Initial value of the reduction
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+**Example**
+
+The code snippet below illustrates a user-defined min-reduction of a device vector of `int` data elements.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_reduce.cuh>
+
+// CustomMin functor
+struct CustomMin
+{
+ template <typename T>
+ __device__ __forceinline__
+ T operator()(const T &a, const T &b) const {
+ return (b < a) ? b : a;
+ }
+};
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_items; // e.g., 7
+int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_out; // e.g., [-]
+CustomMin min_op;
+int init; // e.g., INT_MAX
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceReduce::Reduce(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, num_items, min_op, init);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run reduction
+cub::DeviceReduce::Reduce(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, num_items, min_op, init);
+
+// d_out <-- [0]
+```
+
+
+
+
+nodiscard
+
+Computes a device-wide reduction using the specified binary `reduction_op` functor and initial value `init`.
+
+
+```cpp showLineNumbers={false}
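+template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT, typename T, typename NumItemsT, typename EnvT = ::cuda::std::execution::env<>>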
+static cudaError_t cub::DeviceReduce::Reduce(
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumItemsT num_items,
+ ReductionOpT reduction_op,
+ T init,
+ EnvT env = {}
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+- Does not support binary reduction operators that are non-commutative.
+- By default, provides "run-to-run" determinism for pseudo-associative reduction (e.g., addition of floating point types) on the same GPU device. However, results for pseudo-associative reduction may be inconsistent from one device to another device of a different compute-capability because CUB can employ different tile-sizing for different architectures. To request "gpu-to-gpu" determinism, pass `cuda::execution::require(cuda::execution::determinism::gpu_to_gpu)` as the `env` parameter. To request "not-guaranteed" determinism, pass `cuda::execution::require(cuda::execution::determinism::not_guaranteed)` as the `env` parameter.
+- The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+
+**[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
+
+
+**Parameters**
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+Binary reduction functor
+
+
+
+Initial value of the reduction
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
+
+
+### Sum inline static
+
+
+
+
+nodiscard
+
+Computes a device-wide sum using the addition (`+`) operator.
+
+
+```cpp showLineNumbers={false}
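+template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT, typename EnvT = ::cuda::std::execution::env<>>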
+static cudaError_t cub::DeviceReduce::Sum(
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumItemsT num_items,
+ EnvT env = {}
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+- Uses `0` as the initial value of the reduction.
+- Does not support `+` operators that are non-commutative.
+- Provides "run-to-run" determinism for pseudo-associative reduction (e.g., addition of floating point types) on the same GPU device. However, results for pseudo-associative reduction may be inconsistent from one device to another device of a different compute-capability because CUB can employ different tile-sizing for different architectures. To request "gpu-to-gpu" determinism, pass `cuda::execution::require(cuda::execution::determinism::gpu_to_gpu)` as the `env` parameter. To request "not-guaranteed" determinism, pass `cuda::execution::require(cuda::execution::determinism::not_guaranteed)` as the `env` parameter.
+- The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
+
+
+**Parameters**
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
+
+
+Computes a device-wide sum using the addition (`+`) operator.
+
+
+```cpp showLineNumbers={false}
+template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
+static cudaError_t cub::DeviceReduce::Sum(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+- Uses `0` as the initial value of the reduction.
+- Does not support `+` operators that are non-commutative.
+- Provides "run-to-run" determinism for pseudo-associative reduction (e.g., addition of floating point types) on the same GPU device. However, results for pseudo-associative reduction may be inconsistent from one device to another device of a different compute-capability because CUB can employ different tile-sizing for different architectures.
+- The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
+- When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+**Example**
+
+The code snippet below illustrates the sum-reduction of a device vector of `int` data elements.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for input and output
+int num_items; // e.g., 7
+int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_out; // e.g., [-]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceReduce::Sum(
+ d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sum-reduction
+cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+
+// d_out <-- [38]
+```
+
+
+
+
+### Min inline static
+
+
+
+
+Computes a device-wide minimum using the less-than (`<`) operator.
+
+
+```cpp showLineNumbers={false}
+template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
+static cudaError_t cub::DeviceReduce::Min(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+- Uses `cuda::std::numeric_limits<T>::max()` as the initial value of the reduction.
+- Does not support `<` operators that are non-commutative.
+- Provides "run-to-run" determinism for pseudo-associative reduction (e.g., addition of floating point types) on the same GPU device. However, results for pseudo-associative reduction may be inconsistent from one device to another device of a different compute-capability because CUB can employ different tile-sizing for different architectures.
+- The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
+- When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+**Example**
+
+The code snippet below illustrates the min-reduction of a device vector of `int` data elements.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_reduce.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for input and output
+int num_items; // e.g., 7
+int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_out; // e.g., [-]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceReduce::Min(
+ d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run min-reduction
+cub::DeviceReduce::Min(
+ d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+
+// d_out <-- [0]
+```
+
+
+
+
+nodiscard
+
+Computes a device-wide minimum using the less-than (`<`) operator. The result is written to the output iterator.
+
+
+```cpp showLineNumbers={false}
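+template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT, typename EnvT = ::cuda::std::execution::env<>>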
+static cudaError_t cub::DeviceReduce::Min(
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumItemsT num_items,
+ EnvT env = {}
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Uses `cuda::std::numeric_limits::max()` as the initial value of the reduction.
+Provides determinism based on the environment's determinism requirements. To request "run-to-run" determinism, pass `cuda::execution::require(cuda::execution::determinism::run_to_run)` as the `env` parameter.
+The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
+
+
+**Parameters**
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
+
+
+
+
+### ArgMin inline static
+
+
+
+
+Finds the first device-wide minimum using the less-than (`<`) operator and also returns the index of that item.
+
+
+```cpp showLineNumbers={false}
+template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT>
+static cudaError_t cub::DeviceReduce::ArgMin(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ ExtremumOutIteratorT d_min_out,
+ IndexOutIteratorT d_index_out,
+ ::cuda::std::int64_t num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The minimum is written to `d_min_out`
+The offset of the returned item is written to `d_index_out`, the offset type being written is of type `cuda::std::int64_t`.
+For zero-length inputs, `cuda::std::numeric_limits<T>::max()` is written to `d_min_out` and the index `1` is written to `d_index_out`.
+Does not support `<` operators that are non-commutative.
+Provides "run-to-run" determinism for pseudo-associative reduction (e.g., addition of floating point types) on the same GPU device. However, results for pseudo-associative reduction may be inconsistent from one device to another device of a different compute-capability because CUB can employ different tile-sizing for different architectures.
+The range `[d_in, d_in + num_items)` shall not overlap `d_min_out` nor `d_index_out`.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (of some type `T`) (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording minimum value
+
+
+
+**[inferred]** Output iterator type for recording index of the returned value
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Iterator to the input sequence of data items
+
+
+
+Iterator to which the minimum value is written
+
+
+
+Iterator to which the index of the returned value is written
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the argmin-reduction of a device vector of `int` data elements.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
+#include <cuda/std/cstdint>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for input and output
+int num_items; // e.g., 7
+int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_min_out; // memory for the minimum value
+cuda::std::int64_t *d_index_out; // memory for the index of the returned value
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out,
+num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run argmin-reduction
+cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out,
+num_items);
+
+// d_min_out <-- 0
+// d_index_out <-- 5
+```
+
+
+
+
+nodiscard
+
+Finds the first device-wide minimum using the less-than (`<`) operator and also returns the index of that item.
+
+
+```cpp showLineNumbers={false}
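+template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT, typename EnvT = ::cuda::std::execution::env<>>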
+static cudaError_t cub::DeviceReduce::ArgMin(
+ InputIteratorT d_in,
+ ExtremumOutIteratorT d_min_out,
+ IndexOutIteratorT d_index_out,
+ ::cuda::std::int64_t num_items,
+ EnvT env = {}
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The minimum is written to `d_min_out`
+The offset of the returned item is written to `d_index_out`, the offset type being written is of type `cuda::std::int64_t`.
+For zero-length inputs, `cuda::std::numeric_limits<T>::max()` is written to `d_min_out` and the index `1` is written to `d_index_out`.
+Does not support `<` operators that are non-commutative.
+Provides determinism based on the environment's determinism requirements. To request "run-to-run" determinism, pass `cuda::execution::require(cuda::execution::determinism::run_to_run)` as the `env` parameter.
+The range `[d_in, d_in + num_items)` shall not overlap `d_min_out` nor `d_index_out`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (of some type `T`) (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording minimum value
+
+
+
+**[inferred]** Output iterator type for recording index of the returned value
+
+
+
+**[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
+
+
+**Parameters**
+
+
+Iterator to the input sequence of data items
+
+
+
+Iterator to which the minimum value is written
+
+
+
+Iterator to which the index of the returned value is written
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
+
+
+
+
+Finds the first device-wide minimum using the less-than (`<`) operator, also returning the index of that item.
+
+
+```cpp showLineNumbers={false}
+template <typename InputIteratorT, typename OutputIteratorT>
+static cudaError_t cub::DeviceReduce::ArgMin(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ int num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The output value type of `d_out` is `cub::KeyValuePair<int, T>` (assuming the value type of `d_in` is `T`)
+The minimum is written to `d_out.value` and its offset in the input array is written to `d_out.key`.
+The `{1, cuda::std::numeric_limits<T>::max()}` tuple is produced for zero-length inputs
+Does not support `<` operators that are non-commutative.
+Provides "run-to-run" determinism for pseudo-associative reduction (e.g., addition of floating point types) on the same GPU device. However, results for pseudo-associative reduction may be inconsistent from one device to another device of a different compute-capability because CUB can employ different tile-sizing for different architectures.
+The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (of some type `T`) (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (having value type `cub::KeyValuePair<int, T>`) (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the argmin-reduction of a device vector of `int` data elements.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for input and output
+int num_items; // e.g., 7
+int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+KeyValuePair<int, int> *d_argmin; // e.g., [{-,-}]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run argmin-reduction
+cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
+
+// d_argmin <-- [{5, 0}]
+```
+
+
+
+
+### Max inline static
+
+
+
+
+Computes a device-wide maximum using the greater-than (`>`) operator.
+
+
+```cpp showLineNumbers={false}
+template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
+static cudaError_t cub::DeviceReduce::Max(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Uses `cuda::std::numeric_limits::lowest()` as the initial value of the reduction.
+Does not support `>` operators that are non-commutative.
+Provides "run-to-run" determinism for pseudo-associative reduction (e.g., addition of floating point types) on the same GPU device. However, results for pseudo-associative reduction may be inconsistent from one device to another device of a different compute-capability because CUB can employ different tile-sizing for different architectures.
+The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the max-reduction of a device vector of `int` data elements.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for input and output
+int num_items; // e.g., 7
+int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_max; // e.g., [-]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run max-reduction
+cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
+
+// d_max <-- [9]
+```
+
+
+
+
+nodiscard
+
+Computes a device-wide maximum using the greater-than (`>`) operator. The result is written to the output iterator.
+
+
+```cpp showLineNumbers={false}
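+template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT, typename EnvT = ::cuda::std::execution::env<>>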
+static cudaError_t cub::DeviceReduce::Max(
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumItemsT num_items,
+ EnvT env = {}
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Uses `cuda::std::numeric_limits::lowest()` as the initial value of the reduction.
+Provides determinism based on the environment's determinism requirements. To request "run-to-run" determinism, pass `cuda::execution::require(cuda::execution::determinism::run_to_run)` as the `env` parameter.
+The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
+
+
+**Parameters**
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
+
+
+
+
+### ArgMax inline static
+
+
+
+
+Finds the first device-wide maximum using the greater-than (`>`) operator and also returns the index of that item.
+
+
+```cpp showLineNumbers={false}
+template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT>
+static cudaError_t cub::DeviceReduce::ArgMax(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ ExtremumOutIteratorT d_max_out,
+ IndexOutIteratorT d_index_out,
+ ::cuda::std::int64_t num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The maximum is written to `d_max_out`
+The offset of the returned item is written to `d_index_out`, the offset type being written is of type `cuda::std::int64_t`.
+For zero-length inputs, `cuda::std::numeric_limits<T>::lowest()` is written to `d_max_out` and the index `1` is written to `d_index_out`.
+Does not support `>` operators that are non-commutative.
+Provides "run-to-run" determinism for pseudo-associative reduction (e.g., addition of floating point types) on the same GPU device. However, results for pseudo-associative reduction may be inconsistent from one device to another device of a different compute-capability because CUB can employ different tile-sizing for different architectures.
+The range `[d_in, d_in + num_items)` shall not overlap `d_max_out` nor `d_index_out`.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (of some type `T`) (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording maximum value
+
+
+
+**[inferred]** Output iterator type for recording index of the returned value
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Iterator to which the maximum value is written
+
+
+
+Iterator to which the index of the returned value is written
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the argmax-reduction of a device vector of `int` data elements.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
+#include <cuda/std/cstdint>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for input and output
+int num_items; // e.g., 7
+int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_max_out; // memory for the maximum value
+cuda::std::int64_t *d_index_out; // memory for the index of the returned value
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceReduce::ArgMax(
+ d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run argmax-reduction
+cub::DeviceReduce::ArgMax(
+ d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items);
+
+// d_max_out <-- 9
+// d_index_out <-- 6
+```
+
+
+
+
+Finds the first device-wide maximum using the greater-than (`>`) operator, also returning the index of that item
+
+
+```cpp showLineNumbers={false}
+template <typename InputIteratorT, typename OutputIteratorT>
+static cudaError_t cub::DeviceReduce::ArgMax(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ int num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The output value type of `d_out` is `cub::KeyValuePair<int, T>` (assuming the value type of `d_in` is `T`)
+The maximum is written to `d_out.value` and its offset in the input array is written to `d_out.key`.
+The `{1, cuda::std::numeric_limits<T>::lowest()}` tuple is produced for zero-length inputs
+Does not support `>` operators that are non-commutative.
+Provides "run-to-run" determinism for pseudo-associative reduction (e.g., addition of floating point types) on the same GPU device. However, results for pseudo-associative reduction may be inconsistent from one device to another device of a different compute-capability because CUB can employ different tile-sizing for different architectures.
+The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (of some type `T`) (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (having value type `cub::KeyValuePair<int, T>`) (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the argmax-reduction of a device vector of `int` data elements.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_reduce.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for input and output
+int num_items; // e.g., 7
+int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+KeyValuePair<int, int> *d_argmax; // e.g., [{-,-}]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceReduce::ArgMax(
+ d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run argmax-reduction
+cub::DeviceReduce::ArgMax(
+ d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
+
+// d_argmax <-- [{6, 9}]
+```
+
+
+
+
+nodiscard
+
+Finds the first device-wide maximum using the greater-than (`>`) operator and also returns the index of that item.
+
+
+```cpp showLineNumbers={false}
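+template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT, typename EnvT = ::cuda::std::execution::env<>>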
+static cudaError_t cub::DeviceReduce::ArgMax(
+ InputIteratorT d_in,
+ ExtremumOutIteratorT d_max_out,
+ IndexOutIteratorT d_index_out,
+ ::cuda::std::int64_t num_items,
+ EnvT env = {}
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The maximum is written to `d_max_out`
+The offset of the returned item is written to `d_index_out`, the offset type being written is of type `cuda::std::int64_t`.
+For zero-length inputs, `cuda::std::numeric_limits<T>::lowest()` is written to `d_max_out` and the index `1` is written to `d_index_out`.
+Does not support `>` operators that are non-commutative.
+Provides determinism based on the environment's determinism requirements. To request "run-to-run" determinism, pass `cuda::execution::require(cuda::execution::determinism::run_to_run)` as the `env` parameter.
+The range `[d_in, d_in + num_items)` shall not overlap `d_max_out` nor `d_index_out`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (of some type `T`) (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording maximum value
+
+
+
+**[inferred]** Output iterator type for recording index of the returned value
+
+
+
+**[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
+
+
+**Parameters**
+
+
+Iterator to the input sequence of data items
+
+
+
+Iterator to which the maximum value is written
+
+
+
+Iterator to which the index of the returned value is written
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
+
+
+
+
+### TransformReduce inline static
+
+Fuses transform and reduce operations
+
+
+```cpp showLineNumbers={false}
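+template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT, typename TransformOpT, typename T, typename NumItemsT>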
+static cudaError_t cub::DeviceReduce::TransformReduce(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumItemsT num_items,
+ ReductionOpT reduction_op,
+ TransformOpT transform_op,
+ T init,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Does not support binary reduction operators that are non-commutative.
+Provides "run-to-run" determinism for pseudo-associative reduction (e.g., addition of floating point types) on the same GPU device. However, results for pseudo-associative reduction may be inconsistent from one device to another device of a different compute-capability because CUB can employ different tile-sizing for different architectures.
+The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+
+**[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Unary transform functor type having member `auto operator()(const T &a)`
+
+
+
+**[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+Binary reduction functor
+
+
+
+Unary transform functor
+
+
+
+Initial value of the reduction
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates a transform-reduction (sum of squares plus an initial value) of a device vector of `int` data elements.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_reduce.cuh>
+
+thrust::device_vector<int> in = { 1, 2, 3, 4 };
+thrust::device_vector<int> out(1);
+
+size_t temp_storage_bytes = 0;
+uint8_t *d_temp_storage = nullptr;
+
+const int init = 42;
+
+cub::DeviceReduce::TransformReduce(
+ d_temp_storage,
+ temp_storage_bytes,
+ in.begin(),
+ out.begin(),
+ in.size(),
+ cuda::std::plus<>{},
+ square_t{},
+ init);
+
+thrust::device_vector<uint8_t> temp_storage(temp_storage_bytes);
+d_temp_storage = temp_storage.data().get();
+
+cub::DeviceReduce::TransformReduce(
+ d_temp_storage,
+ temp_storage_bytes,
+ in.begin(),
+ out.begin(),
+ in.size(),
+ cuda::std::plus<>{},
+ square_t{},
+ init);
+
+// out[0] <-- 72
+```
+
+### ReduceByKey inline static
+
+Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
+
+This operation computes segmented reductions within `d_values_in` using the specified binary `reduction_op` functor. The segments are identified by "runs" of corresponding keys in `d_keys_in`, where runs are maximal ranges of consecutive, identical keys. For the *i*th run encountered, the last key of the run and the corresponding value aggregate of that run are written to `d_unique_out[i]` and `d_aggregates_out[i]`, respectively. The total number of runs encountered is written to `d_num_runs_out`.
+
+
+```cpp showLineNumbers={false}
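+template <typename KeysInputIteratorT, typename UniqueOutputIteratorT, typename ValuesInputIteratorT, typename AggregatesOutputIteratorT, typename NumRunsOutputIteratorT, typename ReductionOpT, typename NumItemsT>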
+static cudaError_t cub::DeviceReduce::ReduceByKey(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeysInputIteratorT d_keys_in,
+ UniqueOutputIteratorT d_unique_out,
+ ValuesInputIteratorT d_values_in,
+ AggregatesOutputIteratorT d_aggregates_out,
+ NumRunsOutputIteratorT d_num_runs_out,
+ ReductionOpT reduction_op,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The `==` equality operator is used to determine whether keys are equivalent
+Provides "run-to-run" determinism for pseudo-associative reduction (e.g., addition of floating point types) on the same GPU device. However, results for pseudo-associative reduction may be inconsistent from one device to another device of a different compute-capability because CUB can employ different tile-sizing for different architectures.
+Let `out` be any of `[d_unique_out, d_unique_out + *d_num_runs_out)` `[d_aggregates_out, d_aggregates_out + *d_num_runs_out)` `d_num_runs_out`. The ranges represented by `out` shall not overlap `[d_keys_in, d_keys_in + num_items)`, `[d_values_in, d_values_in + num_items)` nor `out` in any way.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input keys (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing unique output keys (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading input values (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of runs encountered (may be a simple pointer type)
+
+
+
+**[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of keys
+
+
+
+Pointer to the output sequence of unique keys (one key per run)
+
+
+
+Pointer to the input sequence of corresponding values
+
+
+
+Pointer to the output sequence of value aggregates (one aggregate per run)
+
+
+
+Pointer to total number of runs encountered (i.e., the length of `d_unique_out`)
+
+
+
+Binary reduction functor
+
+
+
+Total number of associated key+value pairs (i.e., the length of `d_keys_in` and `d_values_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the segmented reduction of `int` values grouped by runs of associated `int` keys.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_reduce.cuh>
+
+// CustomMin functor
+struct CustomMin
+{
+ template <typename T>
+ __device__ __forceinline__
+ T operator()(const T &a, const T &b) const {
+ return (b < a) ? b : a;
+ }
+};
+
+// Declare, allocate, and initialize device-accessible pointers
+// for input and output
+int num_items; // e.g., 8
+int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
+int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -]
+int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -]
+int *d_num_runs_out; // e.g., [-]
+CustomMin reduction_op;
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceReduce::ReduceByKey(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_unique_out, d_values_in,
+ d_aggregates_out, d_num_runs_out, reduction_op, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run reduce-by-key
+cub::DeviceReduce::ReduceByKey(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_unique_out, d_values_in,
+ d_aggregates_out, d_num_runs_out, reduction_op, num_items);
+
+// d_unique_out <-- [0, 2, 9, 5, 8]
+// d_aggregates_out <-- [0, 1, 6, 2, 4]
+// d_num_runs_out <-- [5]
+```
diff --git a/fern/cudapages/cub/cub/cub/DeviceRleDispatch.mdx b/fern/cudapages/cub/cub/cub/DeviceRleDispatch.mdx
new file mode 100644
index 0000000..3efd30e
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceRleDispatch.mdx
@@ -0,0 +1,212 @@
+---
+title: cub::DeviceRleDispatch
+description: "Utility class for dispatching the appropriately-tuned kernels for DeviceRle."
+---
+
+Utility class for dispatching the appropriately-tuned kernels for DeviceRle.
+
+
+
+
+
+Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+Random-access output iterator type for writing run-offset values (may be a simple pointer type)
+
+
+
+Random-access output iterator type for writing run-length values (may be a simple pointer type)
+
+
+
+Output iterator type for recording the number of runs encountered (may be a simple pointer type)
+
+
+
+Equality operator type
+
+
+
+Signed integer type for global offsets
+
+
+
+Implementation detail, do not specify directly, requirements on the content of this type are subject to breaking change.
+
+
+
+
+
+---
+
+## Constructors
+
+### DeviceRleDispatch inline
+
+
+```cpp showLineNumbers={false}
+cub::DeviceRleDispatch::DeviceRleDispatch(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OffsetsOutputIteratorT d_offsets_out,
+ LengthsOutputIteratorT d_lengths_out,
+ NumRunsOutputIteratorT d_num_runs_out,
+ EqualityOpT equality_op,
+ global_offset_t num_items,
+ cudaStream_t stream
+)
+```
+
+
+---
+
+## Methods
+
+### Invoke inline
+
+
+
+
+Internal dispatch routine for computing a device-wide run-length-encode using the specified kernel functions.
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DeviceRleDispatch::Invoke(
+ DeviceScanInitKernelPtr device_scan_init_kernel,
+ DeviceRleSweepKernelPtr device_rle_sweep_kernel
+)
+```
+
+
+**Template parameters**
+
+
+Function type of cub::DeviceScanInitKernel
+
+
+
+Function type of cub::DeviceRleSweepKernel
+
+
+**Parameters**
+
+
+Kernel function pointer to parameterization of cub::DeviceScanInitKernel
+
+
+
+Kernel function pointer to parameterization of cub::DeviceRleSweepKernel
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DeviceRleDispatch::Invoke()
+```
+
+
+
+
+
+---
+
+## Static methods
+
+### Dispatch inline static
+
+Internal dispatch routine.
+
+
+```cpp showLineNumbers={false}
+static cudaError_t cub::DeviceRleDispatch::Dispatch(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OffsetsOutputIteratorT d_offsets_out,
+ LengthsOutputIteratorT d_lengths_out,
+ NumRunsOutputIteratorT d_num_runs_out,
+ EqualityOpT equality_op,
+ OffsetT num_items,
+ cudaStream_t stream
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to [`temp_storage_bytes`](/library/api/cub::_device_rle_dispatch::temp_storage_bytes) and no work is done.
+
+
+
+Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_device_rle_dispatch::d_temp_storage) allocation
+
+
+
+Pointer to input sequence of data items
+
+
+
+Pointer to output sequence of run-offsets
+
+
+
+Pointer to output sequence of run-lengths
+
+
+
+Pointer to total number of runs (i.e., length of [`d_offsets_out`](/library/api/cub::_device_rle_dispatch::d_offsets_out))
+
+
+
+Equality operator for input items
+
+
+
+Total number of input items (i.e., length of [`d_in`](/library/api/cub::_device_rle_dispatch::d_in))
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `local_offset_t` | `::cuda::std::int32_t` |
+| `global_offset_t` | `OffsetT` |
+| `length_t` | `cub::detail::non_void_value_t< LengthsOutputIteratorT, global_offset_t >` |
+| `streaming_context_t` | `::cuda::std::conditional_t< use_streaming_invocation, detail::rle::streaming_context< InputIteratorT, length_t, global_offset_t >, NullType >` |
+| `ScanTileStateT` | `ReduceByKeyScanTileState< length_t, local_offset_t >` |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `use_streaming_invocation` static constexpr | `bool` | |
+| `init_kernel_threads` static constexpr | `int` | |
+| `d_temp_storage` | `void *` | |
+| `temp_storage_bytes` | `size_t &` | |
+| `d_in` | `InputIteratorT` | |
+| `d_offsets_out` | `OffsetsOutputIteratorT` | |
+| `d_lengths_out` | `LengthsOutputIteratorT` | |
+| `d_num_runs_out` | `NumRunsOutputIteratorT` | |
+| `equality_op` | `EqualityOpT` | |
+| `num_items` | `global_offset_t` | |
+| `stream` | `cudaStream_t` | |
diff --git a/fern/cudapages/cub/cub/cub/DeviceRunLengthEncode.mdx b/fern/cudapages/cub/cub/cub/DeviceRunLengthEncode.mdx
new file mode 100644
index 0000000..cf09535
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceRunLengthEncode.mdx
@@ -0,0 +1,270 @@
+---
+title: cub::DeviceRunLengthEncode
+description: ""
+---
+
+DeviceRunLengthEncode provides device-wide, parallel operations for demarcating "runs" of same-valued items within a sequence residing within device-accessible memory.
+
+## Performance considerations
+
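+The work-complexity of run-length encode as a function of input size is linear, resulting in performance throughput that plateaus with problem sizes large enough to saturate the GPU.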
+
+---
+
+## Static methods
+
+### Encode inline static
+
+Computes a run-length encoding of the sequence `d_in`.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceRunLengthEncode::Encode(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ UniqueOutputIteratorT d_unique_out,
+ LengthsOutputIteratorT d_counts_out,
+ NumRunsOutputIteratorT d_num_runs_out,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+- For the *i*th run encountered, the first key of the run and its length are written to `d_unique_out[i]` and `d_counts_out[i]`, respectively.
+- The total number of runs encountered is written to `d_num_runs_out`.
+- The `==` equality operator is used to determine whether values are equivalent.
+- In-place operations are not supported. There must be no overlap between any of the provided ranges:
+  - `[d_unique_out, d_unique_out + *d_num_runs_out)`
+  - `[d_counts_out, d_counts_out + *d_num_runs_out)`
+  - `[d_num_runs_out, d_num_runs_out + 1)`
+  - `[d_in, d_in + num_items)`
+- When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing unique output items (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing output counts (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of runs encountered (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of keys
+
+
+
+Pointer to the output sequence of unique keys (one key per run)
+
+
+
+Pointer to the output sequence of run-lengths (one count per run)
+
+
+
+Pointer to total number of runs
+
+
+
+Total number of input items (i.e., the length of `d_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the run-length encoding of a sequence of `int` values.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_run_length_encode.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_items; // e.g., 8
+int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+int *d_unique_out; // e.g., [ , , , , , , , ]
+int *d_counts_out; // e.g., [ , , , , , , , ]
+int *d_num_runs_out; // e.g., [ ]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceRunLengthEncode::Encode(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run encoding
+cub::DeviceRunLengthEncode::Encode(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
+
+// d_unique_out <-- [0, 2, 9, 5, 8]
+// d_counts_out <-- [1, 2, 1, 3, 1]
+// d_num_runs_out <-- [5]
+```
+
+### NonTrivialRuns inline static
+
+Enumerates the starting offsets and lengths of all non-trivial runs (of `length > 1`) of same-valued keys in the sequence `d_in`.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceRunLengthEncode::NonTrivialRuns(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OffsetsOutputIteratorT d_offsets_out,
+ LengthsOutputIteratorT d_lengths_out,
+ NumRunsOutputIteratorT d_num_runs_out,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+- For the *i*th non-trivial run, the run's starting offset and its length are written to `d_offsets_out[i]` and `d_lengths_out[i]`, respectively.
+- The total number of runs encountered is written to `d_num_runs_out`.
+- The `==` equality operator is used to determine whether values are equivalent.
+- In-place operations are not supported. There must be no overlap between any of the provided ranges:
+  - `[d_offsets_out, d_offsets_out + *d_num_runs_out)`
+  - `[d_lengths_out, d_lengths_out + *d_num_runs_out)`
+  - `[d_num_runs_out, d_num_runs_out + 1)`
+  - `[d_in, d_in + num_items)`
+- When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing run-offset values (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing run-length values (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of runs encountered (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to input sequence of data items
+
+
+
+Pointer to output sequence of run-offsets (one offset per non-trivial run)
+
+
+
+Pointer to output sequence of run-lengths (one count per non-trivial run)
+
+
+
+Pointer to total number of runs (i.e., length of `d_offsets_out`)
+
+
+
+Total number of input items (i.e., the length of `d_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the identification of non-trivial runs within a sequence of `int` values.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_run_length_encode.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for input and output
+int num_items; // e.g., 8
+int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+int *d_offsets_out; // e.g., [ , , , , , , , ]
+int *d_lengths_out; // e.g., [ , , , , , , , ]
+int *d_num_runs_out; // e.g., [ ]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceRunLengthEncode::NonTrivialRuns(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run encoding
+cub::DeviceRunLengthEncode::NonTrivialRuns(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
+
+// d_offsets_out <-- [1, 4]
+// d_lengths_out <-- [2, 3]
+// d_num_runs_out <-- [2]
+```
diff --git a/fern/cudapages/cub/cub/cub/DeviceScan.mdx b/fern/cudapages/cub/cub/cub/DeviceScan.mdx
new file mode 100644
index 0000000..e0c1815
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceScan.mdx
@@ -0,0 +1,2197 @@
+---
+title: cub::DeviceScan
+description: ""
+---
+
+DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory.
+
+## Performance considerations
+
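+The work-complexity of prefix scan as a function of input size is linear, resulting in performance throughput that plateaus with problem sizes large enough to saturate the GPU.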
+
+---
+
+## Exclusive scans
+
+### ExclusiveSum inline static
+
+
+
+
+Computes a device-wide exclusive prefix sum. The value of `0` is applied as the initial value, and is assigned to `*d_out`.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::ExclusiveSum(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative sum operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` shall not overlap in any other way.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+Total number of input items (i.e., the length of `d_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the exclusive prefix sum of an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_items; // e.g., 7
+int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_out; // e.g., [ , , , , , , ]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceScan::ExclusiveSum(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run exclusive prefix sum
+cub::DeviceScan::ExclusiveSum(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, num_items);
+
+// d_out <-- [0, 8, 14, 21, 26, 29, 29]
+```
+
+
+
+
+`[[nodiscard]]`
+
+Computes a device-wide exclusive prefix sum. The value of `0` is applied as the initial value, and is assigned to `*d_out`.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::ExclusiveSum(
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumItemsT num_items,
+ EnvT env = {}
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative sum operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` shall not overlap in any other way.
+@devicestorage
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+
+**[inferred]** Execution environment type. Default is `::cuda::std::execution::env<>`.
+
+
+**Parameters**
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+Total number of input items (i.e., the length of `d_in`)
+
+
+
+**[optional]** Execution environment. Default is `::cuda::std::execution::env{}`.
+
+
+
+
+
+
+
+Computes a device-wide exclusive prefix sum in-place. The value of `0` is applied as the initial value, and is assigned to `*d_data`.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::ExclusiveSum(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ IteratorT d_data,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative sum operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access iterator type for reading scan inputs and writing scan outputs
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the sequence of data items
+
+
+
+Total number of input items (i.e., the length of `d_data`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the exclusive prefix sum of an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_items; // e.g., 7
+int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceScan::ExclusiveSum(
+ d_temp_storage, temp_storage_bytes,
+ d_data, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run exclusive prefix sum
+cub::DeviceScan::ExclusiveSum(
+ d_temp_storage, temp_storage_bytes,
+ d_data, num_items);
+
+// d_data <-- [0, 8, 14, 21, 26, 29, 29]
+```
+
+
+
+
+### ExclusiveScan inline static
+
+
+
+
+Computes a device-wide exclusive prefix scan using the specified binary associative `scan_op` functor. The `init_value` value is applied as the initial value, and is assigned to `*d_out`.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::ExclusiveScan(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ScanOpT scan_op,
+ InitValueT init_value,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` shall not overlap in any other way.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Type of the `init_value`
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+Binary associative scan functor
+
+
+
+Initial value to seed the exclusive scan (and is assigned to `*d_out`)
+
+
+
+Total number of input items (i.e., the length of `d_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the exclusive prefix min-scan of an `int` device vector
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
+#include <climits> // for INT_MAX
+
+// CustomMin functor
+struct CustomMin
+{
+ template <typename T>
+ __host__ __device__ __forceinline__
+ T operator()(const T &a, const T &b) const {
+ return (b < a) ? b : a;
+ }
+};
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_items; // e.g., 7
+int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_out; // e.g., [ , , , , , , ]
+CustomMin min_op;
+...
+
+// Determine temporary device storage requirements for exclusive
+// prefix scan
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceScan::ExclusiveScan(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, min_op, (int) INT_MAX, num_items);
+
+// Allocate temporary storage for exclusive prefix scan
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run exclusive prefix min-scan
+cub::DeviceScan::ExclusiveScan(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, min_op, (int) INT_MAX, num_items);
+
+// d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
+```
+
+
+
+
+`[[nodiscard]]`
+
+Computes a device-wide exclusive prefix scan using the specified binary associative `scan_op` functor. The `init_value` value is applied as the initial value, and is assigned to `*d_out`.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::ExclusiveScan(
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ScanOpT scan_op,
+ InitValueT init_value,
+ NumItemsT num_items,
+ EnvT env = {}
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` shall not overlap in any other way.
+@devicestorage
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Type of the `init_value`
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+
+**[inferred]** Execution environment type. Default is `::cuda::std::execution::env<>`.
+
+
+**Parameters**
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+Binary associative scan functor
+
+
+
+Initial value to seed the exclusive scan (and is assigned to `*d_out`)
+
+
+
+Total number of input items (i.e., the length of `d_in`)
+
+
+
+**[optional]** Execution environment. Default is `::cuda::std::execution::env{}`.
+
+
+
+
+
+
+
+Computes a device-wide exclusive prefix scan using the specified binary associative `scan_op` functor. The `init_value` value is applied as the initial value, and is assigned to `*d_data`.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::ExclusiveScan(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ IteratorT d_data,
+ ScanOpT scan_op,
+ InitValueT init_value,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
+
+
+
+**[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Type of the `init_value`
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the sequence of data items
+
+
+
+Binary associative scan functor
+
+
+
+Initial value to seed the exclusive scan (and is assigned to `*d_data`)
+
+
+
+Total number of input items (i.e., the length of `d_data`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the exclusive prefix min-scan of an `int` device vector:
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
+#include <climits> // for INT_MAX
+
+// CustomMin functor
+struct CustomMin
+{
+ template <typename T>
+ __host__ __device__ __forceinline__
+ T operator()(const T &a, const T &b) const {
+ return (b < a) ? b : a;
+ }
+};
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_items; // e.g., 7
+int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
+CustomMin min_op;
+...
+
+// Determine temporary device storage requirements for exclusive
+// prefix scan
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceScan::ExclusiveScan(
+ d_temp_storage, temp_storage_bytes,
+ d_data, min_op, (int) INT_MAX, num_items);
+
+// Allocate temporary storage for exclusive prefix scan
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run exclusive prefix min-scan
+cub::DeviceScan::ExclusiveScan(
+ d_temp_storage, temp_storage_bytes,
+ d_data, min_op, (int) INT_MAX, num_items);
+
+// d_data <-- [2147483647, 8, 6, 6, 5, 3, 0]
+```
+
+
+
+
+Computes a device-wide exclusive prefix scan using the specified binary associative `scan_op` functor. The `init_value` value is provided as a future value.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::ExclusiveScan(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ScanOpT scan_op,
+ FutureValue init_value,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` shall not overlap in any other way.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Type of the `init_value`
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output sequence of data items
+
+
+
+Binary associative scan functor
+
+
+
+Initial value to seed the exclusive scan (and is assigned to `*d_out`)
+
+
+
+Total number of input items (i.e., the length of `d_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the exclusive prefix min-scan of an `int` device vector
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
+#include <climits> // for INT_MAX
+
+// CustomMin functor
+struct CustomMin
+{
+ template <typename T>
+ __host__ __device__ __forceinline__
+ T operator()(const T &a, const T &b) const {
+ return (b < a) ? b : a;
+ }
+};
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_items; // e.g., 7
+int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_out; // e.g., [ , , , , , , ]
+int *d_init_iter; // e.g., INT_MAX
+CustomMin min_op;
+
+auto future_init_value =
+ cub::FutureValue(d_init_iter);
+
+...
+
+// Determine temporary device storage requirements for exclusive
+// prefix scan
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceScan::ExclusiveScan(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, min_op, future_init_value, num_items);
+
+// Allocate temporary storage for exclusive prefix scan
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run exclusive prefix min-scan
+cub::DeviceScan::ExclusiveScan(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, min_op, future_init_value, num_items);
+
+// d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
+```
+
+
+
+
+Computes a device-wide exclusive prefix scan using the specified binary associative `scan_op` functor. The `init_value` value is provided as a future value.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::ExclusiveScan(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ IteratorT d_data,
+ ScanOpT scan_op,
+ FutureValue init_value,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
+
+
+
+**[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Type of the `init_value`
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the sequence of data items
+
+
+
+Binary associative scan functor
+
+
+
+Initial value to seed the exclusive scan (and is assigned to `*d_data`)
+
+
+
+Total number of input items (i.e., the length of `d_data`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the exclusive prefix min-scan of an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
+#include <climits> // for INT_MAX
+
+// CustomMin functor
+struct CustomMin
+{
+ template <typename T>
+ __host__ __device__ __forceinline__
+ T operator()(const T &a, const T &b) const {
+ return (b < a) ? b : a;
+ }
+};
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_items; // e.g., 7
+int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_init_iter; // e.g., INT_MAX
+CustomMin min_op;
+
+auto future_init_value =
+ cub::FutureValue(d_init_iter);
+
+...
+
+// Determine temporary device storage requirements for exclusive
+// prefix scan
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceScan::ExclusiveScan(
+ d_temp_storage, temp_storage_bytes,
+ d_data, min_op, future_init_value, num_items);
+
+// Allocate temporary storage for exclusive prefix scan
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run exclusive prefix min-scan
+cub::DeviceScan::ExclusiveScan(
+ d_temp_storage, temp_storage_bytes,
+ d_data, min_op, future_init_value, num_items);
+
+// d_data <-- [2147483647, 8, 6, 6, 5, 3, 0]
+```
+
+
+
+
+---
+
+## Inclusive scans
+
+### InclusiveSum inline static
+
+
+
+
+Computes a device-wide inclusive prefix sum.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::InclusiveSum(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative sum operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The ranges `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` shall not overlap in any other way.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+Total number of input items (i.e., the length of `d_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the inclusive prefix sum of an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_items; // e.g., 7
+int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_out; // e.g., [ , , , , , , ]
+...
+
+// Determine temporary device storage requirements for inclusive
+// prefix sum
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceScan::InclusiveSum(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, num_items);
+
+// Allocate temporary storage for inclusive prefix sum
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run inclusive prefix sum
+cub::DeviceScan::InclusiveSum(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, num_items);
+
+// d_out <-- [8, 14, 21, 26, 29, 29, 38]
+```
+
+
+
+
+Computes a device-wide inclusive prefix sum in-place.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::InclusiveSum(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ IteratorT d_data,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative sum operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the sequence of data items
+
+
+
+Total number of input items (i.e., the length of `d_data`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the inclusive prefix sum of an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_items; // e.g., 7
+int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
+...
+
+// Determine temporary device storage requirements for inclusive
+// prefix sum
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceScan::InclusiveSum(
+ d_temp_storage, temp_storage_bytes,
+ d_data, num_items);
+
+// Allocate temporary storage for inclusive prefix sum
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run inclusive prefix sum
+cub::DeviceScan::InclusiveSum(
+ d_temp_storage, temp_storage_bytes,
+ d_data, num_items);
+
+// d_data <-- [8, 14, 21, 26, 29, 29, 38]
+```
+
+
+
+
+### InclusiveScan inline static
+
+
+
+
+Computes a device-wide inclusive prefix scan using the specified binary associative `scan_op` functor.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::InclusiveScan(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ScanOpT scan_op,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The ranges `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` shall not overlap in any other way.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+Binary associative scan functor
+
+
+
+Total number of input items (i.e., the length of `d_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the inclusive prefix min-scan of an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
+#include <climits> // for INT_MAX
+
+// CustomMin functor
+struct CustomMin
+{
+ template <typename T>
+ __host__ __device__ __forceinline__
+ T operator()(const T &a, const T &b) const {
+ return (b < a) ? b : a;
+ }
+};
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_items; // e.g., 7
+int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_out; // e.g., [ , , , , , , ]
+CustomMin min_op;
+...
+
+// Determine temporary device storage requirements for inclusive
+// prefix scan
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceScan::InclusiveScan(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, min_op, num_items);
+
+// Allocate temporary storage for inclusive prefix scan
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run inclusive prefix min-scan
+cub::DeviceScan::InclusiveScan(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, min_op, num_items);
+
+// d_out <-- [8, 6, 6, 5, 3, 0, 0]
+```
+
+
+
+
+Computes a device-wide inclusive prefix scan using the specified binary associative `scan_op` functor.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::InclusiveScan(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ IteratorT d_data,
+ ScanOpT scan_op,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
+
+
+
+**[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the sequence of data items
+
+
+
+Binary associative scan functor
+
+
+
+Total number of input items (i.e., the length of `d_data`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the inclusive prefix min-scan of an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
+#include <climits> // for INT_MAX
+
+// CustomMin functor
+struct CustomMin
+{
+ template <typename T>
+ __host__ __device__ __forceinline__
+ T operator()(const T &a, const T &b) const {
+ return (b < a) ? b : a;
+ }
+};
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_items; // e.g., 7
+int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
+CustomMin min_op;
+...
+
+// Determine temporary device storage requirements for inclusive
+// prefix scan
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceScan::InclusiveScan(
+ d_temp_storage, temp_storage_bytes,
+ d_data, min_op, num_items);
+
+// Allocate temporary storage for inclusive prefix scan
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run inclusive prefix min-scan
+cub::DeviceScan::InclusiveScan(
+ d_temp_storage, temp_storage_bytes,
+ d_data, min_op, num_items);
+
+// d_data <-- [8, 6, 6, 5, 3, 0, 0]
+```
+
+
+
+
+nodiscard
+
+Computes a device-wide inclusive prefix scan using the specified binary associative `scan_op` functor.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::InclusiveScan(
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ScanOpT scan_op,
+ NumItemsT num_items,
+ EnvT env = {}
+)
+```
+
+
+
+Supports non-commutative scan operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The ranges `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` shall not overlap in any other way.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+
+**[inferred]** Execution environment type. Default is `::cuda::std::execution::env<>`.
+
+
+**Parameters**
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+Binary associative scan functor
+
+
+
+Total number of input items (i.e., the length of `d_in`)
+
+
+
+**[optional]** Execution environment. Default is `::cuda::std::execution::env{}`.
+
+
+
+
+
+
+
+### InclusiveScanInit inline static
+
+
+
+
+Computes a device-wide inclusive prefix scan using the specified binary associative `scan_op` functor. The result of applying the `scan_op` binary operator to `init_value` value and `*d_in` is assigned to `*d_out`.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::InclusiveScanInit(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ScanOpT scan_op,
+ InitValueT init_value,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The ranges `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` shall not overlap in any other way.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Type of the `init_value`
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to the size in bytes of the `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+Binary associative scan functor
+
+
+
+Initial value to seed the inclusive scan (`scan_op(init_value, d_in[0])` is assigned to `*d_out`)
+
+
+
+Total number of input items (i.e., the length of `d_in`)
+
+
+
+CUDA stream to launch kernels within.
+
+
+
+
+
+nodiscard
+
+Computes a device-wide inclusive prefix scan using the specified binary associative `scan_op` functor. The result of applying the `scan_op` binary operator to `init_value` value and `*d_in` is assigned to `*d_out`.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::InclusiveScanInit(
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ScanOpT scan_op,
+ InitValueT init_value,
+ NumItemsT num_items,
+ EnvT env = {}
+)
+```
+
+
+
+Supports non-commutative scan operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The ranges `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` shall not overlap in any other way.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Type of the `init_value`
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+
+**[inferred]** Execution environment type. Default is `::cuda::std::execution::env<>`.
+
+
+**Parameters**
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+Binary associative scan functor
+
+
+
+Initial value to seed the inclusive scan (`scan_op(init_value, d_in[0])` is assigned to `*d_out`)
+
+
+
+Total number of input items (i.e., the length of `d_in`)
+
+
+
+**[optional]** Execution environment. Default is `::cuda::std::execution::env{}`.
+
+
+
+
+
+
+
+---
+
+## Scans by key
+
+### ExclusiveSumByKey inline static
+
+Computes a device-wide exclusive prefix sum-by-key with key equality defined by `equality_op`. The value of `0` is applied as the initial value, and is assigned to the beginning of each segment in `d_values_out`.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::ExclusiveSumByKey(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeysInputIteratorT d_keys_in,
+ ValuesInputIteratorT d_values_in,
+ ValuesOutputIteratorT d_values_out,
+ NumItemsT num_items,
+ EqualityOpT equality_op = EqualityOpT(),
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative sum operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+`d_keys_in` may equal `d_values_out` but the range `[d_keys_in, d_keys_in + num_items)` and the range `[d_values_out, d_values_out + num_items)` shall not overlap otherwise.
+`d_values_in` may equal `d_values_out` but the range `[d_values_in, d_values_in + num_items)` and the range `[d_values_out, d_values_out + num_items)` shall not overlap otherwise.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading scan keys inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading scan values inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing scan values outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Functor type having member `bool operator()(const T &a, const T &b)` for a binary operation that defines the equality of keys
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access input iterator to the input sequence of key items
+
+
+
+Random-access input iterator to the input sequence of value items
+
+
+
+Random-access output iterator to the output sequence of value items
+
+
+
+Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
+
+
+
+Binary functor that defines the equality of keys. Default is `cuda::std::equal_to<>{}`.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the exclusive prefix sum-by-key of an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_items; // e.g., 7
+int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
+int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_values_out; // e.g., [ , , , , , , ]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceScan::ExclusiveSumByKey(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_values_in, d_values_out, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run exclusive prefix sum
+cub::DeviceScan::ExclusiveSumByKey(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_values_in, d_values_out, num_items);
+
+// d_values_out <-- [0, 8, 0, 7, 12, 0, 0]
+```
+
+### ExclusiveScanByKey inline static
+
+Computes a device-wide exclusive prefix scan-by-key using the specified binary associative `scan_op` functor. The key equality is defined by `equality_op`. The `init_value` value is applied as the initial value, and is assigned to the beginning of each segment in `d_values_out`.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::ExclusiveScanByKey(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeysInputIteratorT d_keys_in,
+ ValuesInputIteratorT d_values_in,
+ ValuesOutputIteratorT d_values_out,
+ ScanOpT scan_op,
+ InitValueT init_value,
+ NumItemsT num_items,
+ EqualityOpT equality_op = EqualityOpT(),
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+`d_keys_in` may equal `d_values_out` but the range `[d_keys_in, d_keys_in + num_items)` and the range `[d_values_out, d_values_out + num_items)` shall not overlap otherwise.
+`d_values_in` may equal `d_values_out` but the range `[d_values_in, d_values_in + num_items)` and the range `[d_values_out, d_values_out + num_items)` shall not overlap otherwise.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading scan keys inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading scan values inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing scan values outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Type of the `init_value`
+
+
+
+**[inferred]** Functor type having member `bool operator()(const T &a, const T &b)` for a binary operation that defines the equality of keys
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access input iterator to the input sequence of key items
+
+
+
+Random-access input iterator to the input sequence of value items
+
+
+
+Random-access output iterator to the output sequence of value items
+
+
+
+Binary associative scan functor
+
+
+
+Initial value to seed the exclusive scan (and is assigned to the beginning of each segment in `d_values_out`)
+
+
+
+Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
+
+
+
+Binary functor that defines the equality of keys. Default is `cuda::std::equal_to<>{}`.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the exclusive prefix min-scan-by-key of an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
+#include <climits> // for INT_MAX
+
+// CustomMin functor
+struct CustomMin
+{
+ template <typename T>
+ __host__ __device__ __forceinline__
+ T operator()(const T &a, const T &b) const {
+ return (b < a) ? b : a;
+ }
+};
+
+// CustomEqual functor
+struct CustomEqual
+{
+ template <typename T>
+ __host__ __device__ __forceinline__
+ bool operator()(const T &a, const T &b) const {
+ return a == b;
+ }
+};
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_items; // e.g., 7
+int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
+int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_values_out; // e.g., [ , , , , , , ]
+CustomMin min_op;
+CustomEqual equality_op;
+...
+
+// Determine temporary device storage requirements for exclusive
+// prefix scan
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceScan::ExclusiveScanByKey(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_values_in, d_values_out, min_op,
+ (int) INT_MAX, num_items, equality_op);
+
+// Allocate temporary storage for exclusive prefix scan
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run exclusive prefix min-scan
+cub::DeviceScan::ExclusiveScanByKey(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_values_in, d_values_out, min_op,
+ (int) INT_MAX, num_items, equality_op);
+
+// d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0]
+```
+
+### InclusiveSumByKey inline static
+
+Computes a device-wide inclusive prefix sum-by-key with key equality defined by `equality_op`.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::InclusiveSumByKey(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeysInputIteratorT d_keys_in,
+ ValuesInputIteratorT d_values_in,
+ ValuesOutputIteratorT d_values_out,
+ NumItemsT num_items,
+ EqualityOpT equality_op = EqualityOpT(),
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative sum operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+`d_keys_in` may equal `d_values_out` but the range `[d_keys_in, d_keys_in + num_items)` and the range `[d_values_out, d_values_out + num_items)` shall not overlap otherwise.
+`d_values_in` may equal `d_values_out` but the range `[d_values_in, d_values_in + num_items)` and the range `[d_values_out, d_values_out + num_items)` shall not overlap otherwise.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading scan keys inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading scan values inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing scan values outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Functor type having member `bool operator()(const T &a, const T &b)` for a binary operation that defines the equality of keys
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access input iterator to the input sequence of key items
+
+
+
+Random-access input iterator to the input sequence of value items
+
+
+
+Random-access output iterator to the output sequence of value items
+
+
+
+Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
+
+
+
+Binary functor that defines the equality of keys. Default is `cuda::std::equal_to<>{}`.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the inclusive prefix sum-by-key of an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_items; // e.g., 7
+int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
+int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_values_out; // e.g., [ , , , , , , ]
+...
+
+// Determine temporary device storage requirements for inclusive prefix sum
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceScan::InclusiveSumByKey(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_values_in, d_values_out, num_items);
+
+// Allocate temporary storage for inclusive prefix sum
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run inclusive prefix sum
+cub::DeviceScan::InclusiveSumByKey(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_values_in, d_values_out, num_items);
+
+// d_values_out <-- [8, 14, 7, 12, 15, 0, 9]
+```
+
+### InclusiveScanByKey inline static
+
+Computes a device-wide inclusive prefix scan-by-key using the specified binary associative `scan_op` functor. The key equality is defined by `equality_op`.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceScan::InclusiveScanByKey(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeysInputIteratorT d_keys_in,
+ ValuesInputIteratorT d_values_in,
+ ValuesOutputIteratorT d_values_out,
+ ScanOpT scan_op,
+ NumItemsT num_items,
+ EqualityOpT equality_op = EqualityOpT(),
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Supports non-commutative scan operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run. Additional details can be found in the decoupled look-back description.
+`d_keys_in` may equal `d_values_out` but the range `[d_keys_in, d_keys_in + num_items)` and the range `[d_values_out, d_values_out + num_items)` shall not overlap otherwise.
+`d_values_in` may equal `d_values_out` but the range `[d_values_in, d_values_in + num_items)` and the range `[d_values_out, d_values_out + num_items)` shall not overlap otherwise.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading scan keys inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading scan values inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing scan values outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Functor type having member `bool operator()(const T &a, const T &b)` for a binary operation that defines the equality of keys
+
+
+
+**[inferred]** An integral type representing the number of input elements
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access input iterator to the input sequence of key items
+
+
+
+Random-access input iterator to the input sequence of value items
+
+
+
+Random-access output iterator to the output sequence of value items
+
+
+
+Binary associative scan functor
+
+
+
+Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
+
+
+
+Binary functor that defines the equality of keys. Default is `cuda::std::equal_to<>{}`.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the inclusive prefix min-scan-by-key of an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
+#include <climits> // for INT_MAX
+
+// CustomMin functor
+struct CustomMin
+{
+ template <typename T>
+ __host__ __device__ __forceinline__
+ T operator()(const T &a, const T &b) const {
+ return (b < a) ? b : a;
+ }
+};
+
+// CustomEqual functor
+struct CustomEqual
+{
+ template <typename T>
+ __host__ __device__ __forceinline__
+ bool operator()(const T &a, const T &b) const {
+ return a == b;
+ }
+};
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_items; // e.g., 7
+int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
+int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_values_out; // e.g., [ , , , , , , ]
+CustomMin min_op;
+CustomEqual equality_op;
+...
+
+// Determine temporary device storage requirements for inclusive prefix scan
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceScan::InclusiveScanByKey(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
+
+// Allocate temporary storage for inclusive prefix scan
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run inclusive prefix min-scan
+cub::DeviceScan::InclusiveScanByKey(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
+
+// d_values_out <-- [8, 6, 7, 5, 3, 0, 0]
+```
diff --git a/fern/cudapages/cub/cub/cub/DeviceSegmentedRadixSort.mdx b/fern/cudapages/cub/cub/cub/DeviceSegmentedRadixSort.mdx
new file mode 100644
index 0000000..69c94eb
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceSegmentedRadixSort.mdx
@@ -0,0 +1,1260 @@
+---
+title: cub::DeviceSegmentedRadixSort
+description: ""
+---
+
+DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory.
+
+---
+
+## Key-value pairs
+
+### SortPairs inline static
+
+
+
+
+Sorts segments of key-value pairs into ascending order. (`~2N` auxiliary storage required)
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedRadixSort::SortPairs(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The contents of the input data are not altered by the sorting operation
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall not overlap `[in, in + num_items)`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
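+This operation requires an allocation of temporary device storage that is `O(N+P)`, where `N` is the length of the input and `P` is the number of streaming multiprocessors on the device. For sorting using only `O(P)` temporary storage, see the sorting interface using DoubleBuffer wrappers below.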
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified.
+Note, the size of any segment may not exceed `INT_MAX`. Please consider using `DeviceSegmentedSort` instead, if the size of at least one of your segments could exceed `INT_MAX`.
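+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.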
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Value type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Device-accessible pointer to the input sequence of key data to sort
+
+
+
+Device-accessible pointer to the sorted output sequence of key data
+
+
+
+Device-accessible pointer to the corresponding input sequence of associated value items
+
+
+
+Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
+
+
+
+The total number of items within the segmented array, including items not covered by segments. `num_items` should match the largest element within the range `[d_end_offsets, d_end_offsets + num_segments)`.
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of length `num_segments`, such that `d_begin_offsets[i]` is the first element of the *i*th data segment in `d_keys_*` and `d_values_*`
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*th data segment in `d_keys_*` and `d_values_*`. If
+`d_end_offsets[i] - 1 < d_begin_offsets[i]`, the *i*th segment is considered empty.
+
+
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys with associated vector of `int` values.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
+int *d_values_out; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedRadixSort::SortPairs(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_keys_out, d_values_in, d_values_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedRadixSort::SortPairs(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_keys_out, d_values_in, d_values_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
+// d_values_out <-- [1, 2, 0, 5, 4, 3, 6]
+```
+
+
+
+
+Sorts segments of key-value pairs into ascending order. (`~N` auxiliary storage required)
+
+
+```cpp showLineNumbers={false}
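+template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedRadixSort::SortPairs(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,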
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The sorting operation is given a pair of key buffers and a corresponding pair of associated value buffers. Each pair is managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+The contents of both buffers within each pair may be altered by the sorting operation.
+Upon completion, the sorting operation will update the "current" indicator within each DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range `[cur, cur + num_items)` shall not overlap `[alt, alt + num_items)`. Both ranges shall not overlap `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys.Current()[i]`, `d_values.Current()[i]`, `d_keys.Alternate()[i]`, `d_values.Alternate()[i]` will not be accessed nor modified.
+Note, the size of any segment may not exceed `INT_MAX`. Please consider using `DeviceSegmentedSort` instead, if the size of at least one of your segments could exceed `INT_MAX`.
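+This operation requires a relatively small allocation of temporary device storage that is `O(P)`, where `P` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size `N`).
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.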
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Value type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+
+
+
+The total number of items within the segmented array, including items not covered by segments. `num_items` should match the largest element within the range `[d_end_offsets, d_end_offsets + num_segments)`.
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*th data segment in `d_keys_*` and `d_values_*`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*th data segment in `d_keys_*` and `d_values_*`.
+If `d_end_offsets[i] - 1 < d_begin_offsets[i]`, the *i*th segment is considered empty.
+
+
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys with associated vector of `int` values.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
+int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
+int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Create a set of DoubleBuffers to wrap pairs of device pointers
+cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedRadixSort::SortPairs(
+ d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedRadixSort::SortPairs(
+ d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
+// d_values.Current() <-- [1, 2, 0, 5, 4, 3, 6]
+```
+
+
+
+
+### SortPairsDescending inline static
+
+
+
+
+Sorts segments of key-value pairs into descending order. (`~2N` auxiliary storage required).
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedRadixSort::SortPairsDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The contents of the input data are not altered by the sorting operation
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall not overlap `[in, in + num_items)`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
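+This operation requires an allocation of temporary device storage that is `O(N+P)`, where `N` is the length of the input and `P` is the number of streaming multiprocessors on the device. For sorting using only `O(P)` temporary storage, see the sorting interface using DoubleBuffer wrappers below.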
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified.
+Note, the size of any segment may not exceed `INT_MAX`. Please consider using `DeviceSegmentedSort` instead, if the size of at least one of your segments could exceed `INT_MAX`.
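+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.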
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Value type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Device-accessible pointer to the input sequence of key data to sort
+
+
+
+Device-accessible pointer to the sorted output sequence of key data
+
+
+
+Device-accessible pointer to the corresponding input sequence of associated value items
+
+
+
+Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
+
+
+
+The total number of items within the segmented array, including items not covered by segments. `num_items` should match the largest element within the range `[d_end_offsets, d_end_offsets + num_segments)`.
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*th data segment in `d_keys_*` and `d_values_*`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*th data segment in `d_keys_*` and `d_values_*`.
+If `d_end_offsets[i] - 1 < d_begin_offsets[i]`, the *i*th segment is considered empty.
+
+
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys with associated vector of `int` values.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
+int *d_values_out; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedRadixSort::SortPairsDescending(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_keys_out, d_values_in, d_values_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedRadixSort::SortPairsDescending(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_keys_out, d_values_in, d_values_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
+// d_values_out <-- [0, 2, 1, 6, 3, 4, 5]
+```
+
+
+
+
+Sorts segments of key-value pairs into descending order. (`~N` auxiliary storage required).
+
+
+```cpp showLineNumbers={false}
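+template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedRadixSort::SortPairsDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,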
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The sorting operation is given a pair of key buffers and a corresponding pair of associated value buffers. Each pair is managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+The contents of both buffers within each pair may be altered by the sorting operation.
+Upon completion, the sorting operation will update the "current" indicator within each DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range `[cur, cur + num_items)` shall not overlap `[alt, alt + num_items)`. Both ranges shall not overlap `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys.Current()[i]`, `d_values.Current()[i]`, `d_keys.Alternate()[i]`, `d_values.Alternate()[i]` will not be accessed nor modified.
+Note, the size of any segment may not exceed `INT_MAX`. Please consider using `DeviceSegmentedSort` instead, if the size of at least one of your segments could exceed `INT_MAX`.
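+This operation requires a relatively small allocation of temporary device storage that is `O(P)`, where `P` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size `N`).
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.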
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Value type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+
+
+
+The total number of items within the segmented array, including items not covered by segments. `num_items` should match the largest element within the range `[d_end_offsets, d_end_offsets + num_segments)`.
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*th data segment in `d_keys_*` and `d_values_*`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*th data segment in `d_keys_*` and `d_values_*`.
+If `d_end_offsets[i] - 1 < d_begin_offsets[i]`, the *i*th segment is considered empty.
+
+
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys with associated vector of `int` values.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
+int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
+int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Create a set of DoubleBuffers to wrap pairs of device pointers
+cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedRadixSort::SortPairsDescending(
+ d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedRadixSort::SortPairsDescending(
+ d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
+// d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5]
+```
+
+
+
+
+---
+
+## Keys-only
+
+### SortKeys inline static
+
+
+
+
+Sorts segments of keys into ascending order. (`~2N` auxiliary storage required)
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedRadixSort::SortKeys(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The contents of the input data are not altered by the sorting operation
+An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+The range `[d_keys_out, d_keys_out + num_items)` shall not overlap `[d_keys_in, d_keys_in + num_items)`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
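+This operation requires an allocation of temporary device storage that is `O(N+P)`, where `N` is the length of the input and `P` is the number of streaming multiprocessors on the device. For sorting using only `O(P)` temporary storage, see the sorting interface using DoubleBuffer wrappers below.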
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not be accessed nor modified.
+Note, the size of any segment may not exceed `INT_MAX`. Please consider using `DeviceSegmentedSort` instead, if the size of at least one of your segments could exceed `INT_MAX`.
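+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.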
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Device-accessible pointer to the input sequence of key data to sort
+
+
+
+Device-accessible pointer to the sorted output sequence of key data
+
+
+
+The total number of items within the segmented array, including items not covered by segments. `num_items` should match the largest element within the range `[d_end_offsets, d_end_offsets + num_segments)`.
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*th data segment in `d_keys_*`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*th data segment in `d_keys_*`.
+If `d_end_offsets[i] - 1 < d_begin_offsets[i]`, the *i*th segment is considered empty.
+
+
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_segmented_radix_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedRadixSort::SortKeys(
+ d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedRadixSort::SortKeys(
+ d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
+```
+
+
+
+
+Sorts segments of keys into ascending order. (`~N` auxiliary storage required).
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedRadixSort::SortKeys(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The sorting operation is given a pair of key buffers managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+The contents of both buffers may be altered by the sorting operation.
+Upon completion, the sorting operation will update the "current" indicator within the DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. The range `[cur, cur + num_items)` shall not overlap `[alt, alt + num_items)`. Both ranges shall not overlap `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys.Current()[i]`, `d_keys.Alternate()[i]` will not be accessed nor modified.
+Note, the size of any segment may not exceed `INT_MAX`. Please consider using `DeviceSegmentedSort` instead, if the size of at least one of your segments could exceed `INT_MAX`.
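+This operation requires a relatively small allocation of temporary device storage that is `O(P)`, where `P` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size `N`).
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.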
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+The total number of items within the segmented array, including items not covered by segments. `num_items` should match the largest element within the range `[d_end_offsets, d_end_offsets + num_segments)`.
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*th data segment in `d_keys_*`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length `num_segments`, such that `d_end_offsets[i] - 1` is the last element of the *i*th data segment in `d_keys_*` and `d_values_*`. If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th segment is considered empty.
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_segmented_radix_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for
+// sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Create a DoubleBuffer to wrap the pair of device pointers
+cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedRadixSort::SortKeys(
+ d_temp_storage, temp_storage_bytes, d_keys,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedRadixSort::SortKeys(
+ d_temp_storage, temp_storage_bytes, d_keys,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
+```
+
+
+
+
+### SortKeysDescending inline static
+
+
+
+
+Sorts segments of keys into descending order. (`~2N` auxiliary storage required).
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedRadixSort::SortKeysDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The contents of the input data are not altered by the sorting operation.
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+The range `[d_keys_out, d_keys_out + num_items)` shall not overlap `[d_keys_in, d_keys_in + num_items)`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
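+This operation requires an allocation of temporary device storage that is `O(N+P)`, where `N` is the length of the input and `P` is the number of streaming multiprocessors on the device. For sorting using only `O(P)` temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments, `d_keys_in[i]` and `d_keys_out[i]` will not be accessed nor modified.
+Note, the size of any segment may not exceed `INT_MAX`. Please consider using `DeviceSegmentedSort` instead, if the size of at least one of your segments could exceed `INT_MAX`.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.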
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Device-accessible pointer to the input data of key data to sort
+
+
+
+Device-accessible pointer to the sorted output sequence of key data
+
+
+
+The total number of items within the segmented array, including items not covered by segments. `num_items` should match the largest element within the range `[d_end_offsets, d_end_offsets + num_segments)`.
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of length `num_segments`, such that `d_begin_offsets[i]` is the first element of the *i*th data segment in `d_keys_*` and `d_values_*`.
+
+
+
+Random-access input iterator to the sequence of ending offsets of length `num_segments`, such that `d_end_offsets[i] - 1` is the last element of the *i*th data segment in `d_keys_*` and `d_values_*`. If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th segment is considered empty.
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_segmented_radix_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Create a DoubleBuffer to wrap the pair of device pointers
+cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf);
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedRadixSort::SortKeysDescending(
+ d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedRadixSort::SortKeysDescending(
+ d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
+```
+
+
+
+
+Sorts segments of keys into descending order. (`~N` auxiliary storage required).
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedRadixSort::SortKeysDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ int begin_bit = 0,
+ int end_bit = sizeof(KeyT) *8,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The sorting operation is given a pair of key buffers managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+The contents of both buffers may be altered by the sorting operation.
+Upon completion, the sorting operation will update the "current" indicator within the DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+An optional bit subrange `[begin_bit, end_bit)` of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. The range `[cur, cur + num_items)` shall not overlap `[alt, alt + num_items)`. Both ranges shall not overlap `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments, `d_keys.Current()[i]` and `d_keys.Alternate()[i]` will not be accessed nor modified.
+Note, the size of any segment may not exceed `INT_MAX`. Please consider using `DeviceSegmentedSort` instead, if the size of at least one of your segments could exceed `INT_MAX`.
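+This operation requires a relatively small allocation of temporary device storage that is `O(P)`, where `P` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size `N`).
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.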
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+The total number of items within the segmented array, including items not covered by segments. `num_items` should match the largest element within the range `[d_end_offsets, d_end_offsets + num_segments)`.
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of length `num_segments`, such that `d_begin_offsets[i]` is the first element of the *i*th data segment in `d_keys_*` and `d_values_*`.
+
+
+
+Random-access input iterator to the sequence of ending offsets of length `num_segments`, such that `d_end_offsets[i] - 1` is the last element of the *i*th data segment in `d_keys_*` and `d_values_*`. If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th segment is considered empty.
+
+
+
+**[optional]** The least-significant bit index (inclusive) needed for key comparison
+
+
+
+**[optional]** The most-significant bit index (exclusive) needed for key comparison (e.g., `sizeof(unsigned int) * 8`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_segmented_radix_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Create a DoubleBuffer to wrap the pair of device pointers
+cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedRadixSort::SortKeysDescending(
+ d_temp_storage, temp_storage_bytes, d_keys,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedRadixSort::SortKeysDescending(
+ d_temp_storage, temp_storage_bytes, d_keys,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
+```
+
+
+
+
+---
+
+## Utility methods
+
+### GetName inline static constexpr
+
+
+```cpp showLineNumbers={false}
+static constexpr const char * cub::DeviceSegmentedRadixSort::GetName()
+```
+
diff --git a/fern/cudapages/cub/cub/cub/DeviceSegmentedReduce.mdx b/fern/cudapages/cub/cub/cub/DeviceSegmentedReduce.mdx
new file mode 100644
index 0000000..9ec2643
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceSegmentedReduce.mdx
@@ -0,0 +1,1181 @@
+---
+title: cub::DeviceSegmentedReduce
+description: ""
+---
+
+DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory.
+
+---
+
+## Static methods
+
+### Reduce inline static
+
+
+
+
+Computes a device-wide segmented reduction using the specified binary `reduction_op` functor.
+
+
+```cpp showLineNumbers={false}
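+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOpT, typename T>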
+static cudaError_t cub::DeviceSegmentedReduce::Reduce(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ ReductionOpT reduction_op,
+ T initial_value,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Does not support binary reduction operators that are non-commutative.
+Provides "run-to-run" determinism for pseudo-associative reduction (e.g., addition of floating point types) on the same GPU device. However, results for pseudo-associative reduction may be inconsistent from one device to another device of a different compute-capability because CUB can employ different tile-sizing for different architectures.
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+Let `s` be in `[0, num_segments)`. The range `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)`.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+The number of segments that comprise the segmented reduction data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of length `num_segments`, such that `d_begin_offsets[i]` is the first element of the *i*th data segment in `d_in`.
+
+
+
+Random-access input iterator to the sequence of ending offsets of length `num_segments`, such that `d_end_offsets[i] - 1` is the last element of the *i*th data segment in `d_in`. If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th segment is considered empty.
+
+
+
+Binary reduction functor
+
+
+
+Initial value of the reduction for each segment
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+
+Computes a device-wide segmented reduction using the specified binary `reduction_op` functor and a fixed segment size.
+
+
+```cpp showLineNumbers={false}
+template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT, typename T>
+static cudaError_t cub::DeviceSegmentedReduce::Reduce(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ::cuda::std::int64_t num_segments,
+ int segment_size,
+ ReductionOpT reduction_op,
+ T initial_value,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v3.2.0. First appears in CUDA Toolkit 13.2.*
+
+
+Does not support binary reduction operators that are non-commutative.
+Provides "run-to-run" determinism for pseudo-associative reduction (e.g., addition of floating point types) on the same GPU device. However, results for pseudo-associative reduction may be inconsistent from one device to another device of a different compute-capability because CUB can employ different tile-sizing for different architectures.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+
+**[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregates
+
+
+
+The number of segments that comprise the segmented reduction data
+
+
+
+The fixed segment size of each segment
+
+
+
+Binary reduction functor
+
+
+
+Initial value of the reduction for each segment
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+
+### Sum inline static
+
+
+
+
+Computes a device-wide segmented sum using the addition (`+`) operator.
+
+
+```cpp showLineNumbers={false}
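+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>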
+static cudaError_t cub::DeviceSegmentedReduce::Sum(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Uses `0` as the initial value of the reduction for each segment.
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+Does not support `+` operators that are non-commutative.
+Let `s` be in `[0, num_segments)`. The range `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)`.
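+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.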
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+The number of segments that comprise the segmented reduction data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of length `num_segments`, such that `d_begin_offsets[i]` is the first element of the *i*th data segment in `d_in`.
+
+
+
+Random-access input iterator to the sequence of ending offsets of length `num_segments`, such that `d_end_offsets[i] - 1` is the last element of the *i*th data segment in `d_in`. If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th segment is considered empty.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+
+Computes a device-wide segmented sum using the addition (`+`) operator.
+
+
+```cpp showLineNumbers={false}
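+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename EnvT = ::cuda::std::execution::env<>>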
+static cudaError_t cub::DeviceSegmentedReduce::Sum(
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ EnvT env = {}
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Uses `0` as the initial value of the reduction for each segment.
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+Does not support `+` operators that are non-commutative.
+Let `s` be in `[0, num_segments)`. The range `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)`.
+Can use a specific stream or CUDA memory resource through the `env` parameter.
+@devicestorage
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
+
+
+**Parameters**
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+The number of segments that comprise the segmented reduction data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of length `num_segments`, such that `d_begin_offsets[i]` is the first element of the *i*th data segment in `d_in`.
+
+
+
+Random-access input iterator to the sequence of ending offsets of length `num_segments`, such that `d_end_offsets[i] - 1` is the last element of the *i*th data segment in `d_in`. If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th segment is considered empty.
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
+
+
+Computes a device-wide segmented sum using the addition (`+`) operator.
+
+
+```cpp showLineNumbers={false}
+template <typename InputIteratorT, typename OutputIteratorT>
+static cudaError_t cub::DeviceSegmentedReduce::Sum(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ::cuda::std::int64_t num_segments,
+ int segment_size,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v3.2.0. First appears in CUDA Toolkit 13.2.*
+
+
+Uses `0` as the initial value of the reduction for each segment.
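+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.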
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+The number of segments that comprise the segmented reduction data
+
+
+
+The fixed segment size of each segment
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+
+### Min inline static
+
+
+
+
+Computes a device-wide segmented minimum using the less-than (`<`) operator.
+
+
+```cpp showLineNumbers={false}
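+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>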
+static cudaError_t cub::DeviceSegmentedReduce::Min(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Uses `::cuda::std::numeric_limits<T>::max()` as the initial value of the reduction for each segment.
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+Does not support `<` operators that are non-commutative.
+Let `s` be in `[0, num_segments)`. The range `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)`.
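+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.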
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+The number of segments that comprise the segmented reduction data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of length `num_segments`, such that `d_begin_offsets[i]` is the first element of the *i*th data segment in `d_in`.
+
+
+
+Random-access input iterator to the sequence of ending offsets of length `num_segments`, such that `d_end_offsets[i] - 1` is the last element of the *i*th data segment in `d_in`. If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th segment is considered empty.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+
+Computes a device-wide segmented minimum using the less-than (`<`) operator.
+
+
+```cpp showLineNumbers={false}
+template <typename InputIteratorT, typename OutputIteratorT>
+static cudaError_t cub::DeviceSegmentedReduce::Min(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ::cuda::std::int64_t num_segments,
+ int segment_size,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v3.2.0. First appears in CUDA Toolkit 13.2.*
+
+
+Uses `::cuda::std::numeric_limits<T>::max()` as the initial value of the reduction for each segment.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+The number of segments that comprise the segmented reduction data
+
+
+
+The fixed segment size of each segment
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+
+### ArgMin inline static
+
+
+
+
+Finds the first device-wide minimum in each segment using the less-than (`<`) operator, also returning the in-segment index of that item.
+
+
+```cpp showLineNumbers={false}
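+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>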
+static cudaError_t cub::DeviceSegmentedReduce::ArgMin(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The output value type of `d_out` is `cub::KeyValuePair<int, T>` (assuming the value type of `d_in` is `T`).
+The minimum of the *i*th segment is written to `d_out[i].value` and its offset in that segment is written to `d_out[i].key`.
+The `{1, ::cuda::std::numeric_limits<T>::max()}` tuple is produced for zero-length inputs.
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+Does not support `<` operators that are non-commutative.
+Let `s` be in `[0, num_segments)`. The range `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)`.
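+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.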
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (of some type `T`) (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (having value type `KeyValuePair<int, T>`) (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+The number of segments that comprise the segmented reduction data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_in`.
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_in`.
+If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> segment is considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+
+
+Finds the first device-wide minimum in each segment using the less-than (`<`) operator, also returning the in-segment index of that item.
+
+
+```cpp showLineNumbers={false}
+template <typename InputIteratorT, typename OutputIteratorT>
+static cudaError_t cub::DeviceSegmentedReduce::ArgMin(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ::cuda::std::int64_t num_segments,
+ int segment_size,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v3.2.0. First appears in CUDA Toolkit 13.2.*
+
+
+The output value type of `d_out` is `::cuda::std::pair<int, T>` (assuming the value type of `d_in` is `T`).
+The minimum of the *i*th segment is written to `d_out[i].second` and its offset in that segment is written to `d_out[i].first`.
+The `{1, ::cuda::std::numeric_limits<T>::max()}` tuple is produced for zero-length inputs.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (of some type `T`) (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (having value type `cuda::std::pair<int, T>`) (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+The number of segments that comprise the segmented reduction data
+
+
+
+The fixed segment size of each segment
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+
+
+### Max inline static
+
+
+
+
+Computes a device-wide segmented maximum using the greater-than (`>`) operator.
+
+
+```cpp showLineNumbers={false}
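+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>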
+static cudaError_t cub::DeviceSegmentedReduce::Max(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Uses `::cuda::std::numeric_limits<T>::lowest()` as the initial value of the reduction.
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+Does not support `>` operators that are non-commutative.
+Let `s` be in `[0, num_segments)`. The range `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)`.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+The number of segments that comprise the segmented reduction data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_in`.
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_in`.
+If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> segment is considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+
+
+Computes a device-wide segmented maximum using the greater-than (`>`) operator.
+
+
+```cpp showLineNumbers={false}
+template <typename InputIteratorT, typename OutputIteratorT>
+static cudaError_t cub::DeviceSegmentedReduce::Max(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ::cuda::std::int64_t num_segments,
+ int segment_size,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v3.2.0. First appears in CUDA Toolkit 13.2.*
+
+
+Uses `::cuda::std::numeric_limits<T>::lowest()` as the initial value of the reduction.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+The number of segments that comprise the segmented reduction data
+
+
+
+The fixed segment size of each segment
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+
+
+### ArgMax inline static
+
+
+
+
+Finds the first device-wide maximum in each segment using the greater-than (`>`) operator, also returning the in-segment index of that item.
+
+
+```cpp showLineNumbers={false}
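+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>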
+static cudaError_t cub::DeviceSegmentedReduce::ArgMax(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The output value type of `d_out` is `cub::KeyValuePair<int, T>` (assuming the value type of `d_in` is `T`).
+The maximum of the *i*th segment is written to `d_out[i].value` and its offset in that segment is written to `d_out[i].key`.
+The `{1, ::cuda::std::numeric_limits<T>::lowest()}` tuple is produced for zero-length inputs.
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+Does not support `>` operators that are non-commutative.
+Let `s` be in `[0, num_segments)`. The range `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)`.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (of some type `T`) (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (having value type `KeyValuePair<int, T>`) (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+The number of segments that comprise the segmented reduction data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_in`.
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_in`.
+If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> segment is considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+
+
+Finds the first device-wide maximum in each segment using the greater-than (`>`) operator, also returning the in-segment index of that item.
+
+
+```cpp showLineNumbers={false}
+template <typename InputIteratorT, typename OutputIteratorT>
+static cudaError_t cub::DeviceSegmentedReduce::ArgMax(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ::cuda::std::int64_t num_segments,
+ int segment_size,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v3.2.0. First appears in CUDA Toolkit 13.2.*
+
+
+The output value type of `d_out` is `::cuda::std::pair<int, T>` (assuming the value type of `d_in` is `T`).
+The maximum of the *i*th segment is written to `d_out[i].second` and its offset in that segment is written to `d_out[i].first`.
+The `{1, ::cuda::std::numeric_limits<T>::lowest()}` tuple is produced for zero-length inputs.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (of some type `T`) (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the reduced aggregate (having value type `cuda::std::pair<int, T>`) (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+The number of segments that comprise the segmented reduction data
+
+
+
+The fixed segment size of each segment
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+
diff --git a/fern/cudapages/cub/cub/cub/DeviceSegmentedScan.mdx b/fern/cudapages/cub/cub/cub/DeviceSegmentedScan.mdx
new file mode 100644
index 0000000..f99b279
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceSegmentedScan.mdx
@@ -0,0 +1,1186 @@
+---
+title: cub::DeviceSegmentedScan
+description: ""
+---
+
+DeviceSegmentedScan provides device-wide, parallel operations for computing a batched prefix scan across multiple sequences of data items residing within device-accessible memory.
+
+---
+
+## Static methods
+
+### ExclusiveSegmentedSum inline static
+
+
+
+
+Computes a device-wide segmented exclusive prefix sum.
+
+
+```cpp showLineNumbers={false}
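+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorInputT, typename EndOffsetIteratorInputT>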
+static cudaError_t cub::DeviceSegmentedScan::ExclusiveSegmentedSum(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ BeginOffsetIteratorInputT d_in_begin_offsets,
+ EndOffsetIteratorInputT d_in_end_offsets,
+ ::cuda::std::int64_t num_segments,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+Results are not deterministic for computation of prefix sum on floating-point types and may vary from run to run.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The input and output sequences shall not overlap in any other way.
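+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.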
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading segmented scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing segmented scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_in_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_in` and in `d_out`.
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_in_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_in`.
+If `d_in_end_offsets[i] - 1 <= d_in_begin_offsets[i]`, the *i*<sup>th</sup>
+segment is considered empty.
+
+
+
+
+
+The number of segments that comprise the segmented prefix scan data.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the exclusive segmented prefix sum of an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or, equivalently
+// #include <cub/device/device_segmented_scan.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for
+// input and output
+int num_segments; // e.g., 3
+int *d_in; // e.g., [8, 6, 7, 5, 3, -2, 9]
+int *d_offsets; // e.g., [0, 2, 5, 7]
+int *d_out; // e.g., [ , , , , , , ]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedScan::ExclusiveSegmentedSum(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, d_offsets, d_offsets + 1, num_segments);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run exclusive prefix sum
+cub::DeviceSegmentedScan::ExclusiveSegmentedSum(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, d_offsets, d_offsets + 1, num_segments);
+
+// d_out <-- [0, 8, 0, 7, 12, 0, -2]
+```
+
+
+
+
+Computes a device-wide segmented exclusive prefix sum.
+
+
+```cpp showLineNumbers={false}
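+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorInputT, typename EndOffsetIteratorInputT, typename BeginOffsetIteratorOutputT>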
+static cudaError_t cub::DeviceSegmentedScan::ExclusiveSegmentedSum(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ BeginOffsetIteratorInputT d_in_begin_offsets,
+ EndOffsetIteratorInputT d_in_end_offsets,
+ BeginOffsetIteratorOutputT d_out_begin_offsets,
+ ::cuda::std::int64_t num_segments,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+Results are not deterministic for computation of prefix sum on floating-point types and may vary from run to run.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The input and output sequences shall not overlap in any other way.
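+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.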
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading segmented scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing segmented scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets in the output sequence (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_in_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_in`.
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_in_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_in`.
+If `d_in_end_offsets[i] - 1 <= d_in_begin_offsets[i]`, the *i*<sup>th</sup>
+segment is considered empty.
+
+
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_out_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_out`.
+
+
+
+
+
+The number of segments that comprise the segmented prefix scan data.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+
+
+### ExclusiveSegmentedScan inline static
+
+
+
+
+Computes a device-wide segmented exclusive prefix scan using the specified binary associative `scan_op` functor. The `init_value` value is applied as the initial value, and is assigned to the first element in each output segment.
+
+
+```cpp showLineNumbers={false}
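+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorInputT, typename EndOffsetIteratorInputT, typename ScanOpT, typename InitValueT>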
+static cudaError_t cub::DeviceSegmentedScan::ExclusiveSegmentedScan(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ BeginOffsetIteratorInputT d_in_begin_offsets,
+ EndOffsetIteratorInputT d_in_end_offsets,
+ ::cuda::std::int64_t num_segments,
+ ScanOpT scan_op,
+ InitValueT init_value,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+Supports non-commutative scan operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The input and output sequences shall not overlap in any other way.
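+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.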
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading segmented scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing segmented scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Type of the `init_value`
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_in_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_in` and in `d_out`.
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_in_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_in`.
+If `d_in_end_offsets[i] - 1 <= d_in_begin_offsets[i]`, the *i*<sup>th</sup>
+segment is considered empty.
+
+
+
+
+
+The number of segments that comprise the segmented prefix scan data.
+
+
+
+Binary associative scan functor
+
+
+
+Initial value to seed the exclusive scan for each segment in the output sequence
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+
+
+Computes a device-wide segmented exclusive prefix scan using the specified binary associative `scan_op` functor. The `init_value` value is applied as the initial value, and is assigned to the first element in each output segment.
+
+
+```cpp showLineNumbers={false}
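+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorInputT, typename EndOffsetIteratorInputT, typename BeginOffsetIteratorOutputT, typename ScanOpT, typename InitValueT>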
+static cudaError_t cub::DeviceSegmentedScan::ExclusiveSegmentedScan(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ BeginOffsetIteratorInputT d_in_begin_offsets,
+ EndOffsetIteratorInputT d_in_end_offsets,
+ BeginOffsetIteratorOutputT d_out_begin_offsets,
+ ::cuda::std::int64_t num_segments,
+ ScanOpT scan_op,
+ InitValueT init_value,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+Supports non-commutative scan operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The input and output sequences shall not overlap in any other way.
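+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.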
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading segmented scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing segmented scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets in the output sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Type of the `init_value`
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_in_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_in`.
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_in_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_in`.
+If `d_in_end_offsets[i] - 1 <= d_in_begin_offsets[i]`, the *i*<sup>th</sup>
+segment is considered empty.
+
+
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_out_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_out`.
+
+
+
+
+
+The number of segments that comprise the segmented prefix scan data.
+
+
+
+Binary associative scan functor
+
+
+
+Initial value to seed the exclusive scan for each segment in the output sequence
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+
+
+### InclusiveSegmentedSum inline static
+
+
+
+
+Computes a device-wide segmented inclusive prefix sum.
+
+
+```cpp showLineNumbers={false}
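+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorInputT, typename EndOffsetIteratorInputT>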
+static cudaError_t cub::DeviceSegmentedScan::InclusiveSegmentedSum(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ BeginOffsetIteratorInputT d_in_begin_offsets,
+ EndOffsetIteratorInputT d_in_end_offsets,
+ ::cuda::std::int64_t num_segments,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+Results are not deterministic for computation of prefix sum on floating-point types and may vary from run to run.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The input and output sequences shall not overlap in any other way.
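+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.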
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading segmented scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing segmented scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_in_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_in` and in `d_out`.
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_in_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_in`.
+If `d_in_end_offsets[i] - 1 <= d_in_begin_offsets[i]`, the *i*<sup>th</sup>
+segment is considered empty.
+
+
+
+
+
+The number of segments that comprise the segmented prefix scan data.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+
+
+Computes a device-wide segmented inclusive prefix sum.
+
+
+```cpp showLineNumbers={false}
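+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorInputT, typename EndOffsetIteratorInputT, typename BeginOffsetIteratorOutputT>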
+static cudaError_t cub::DeviceSegmentedScan::InclusiveSegmentedSum(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ BeginOffsetIteratorInputT d_in_begin_offsets,
+ EndOffsetIteratorInputT d_in_end_offsets,
+ BeginOffsetIteratorOutputT d_out_begin_offsets,
+ ::cuda::std::int64_t num_segments,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+Results are not deterministic for computation of prefix sum on floating-point types and may vary from run to run.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The input and output sequences shall not overlap in any other way.
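+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.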
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading segmented scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing segmented scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets in the output sequence (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_in_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_in`.
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_in_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_in`.
+If `d_in_end_offsets[i] - 1 <= d_in_begin_offsets[i]`, the *i*<sup>th</sup>
+segment is considered empty.
+
+
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_out_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_out`.
+
+
+
+
+
+The number of segments that comprise the segmented prefix scan data.
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+
+
+### InclusiveSegmentedScan inline static
+
+
+
+
+Computes a device-wide segmented inclusive prefix scan using the specified binary associative `scan_op` functor.
+
+
+```cpp showLineNumbers={false}
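+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorInputT, typename EndOffsetIteratorInputT, typename ScanOpT>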
+static cudaError_t cub::DeviceSegmentedScan::InclusiveSegmentedScan(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ BeginOffsetIteratorInputT d_in_begin_offsets,
+ EndOffsetIteratorInputT d_in_end_offsets,
+ ::cuda::std::int64_t num_segments,
+ ScanOpT scan_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+Supports non-commutative scan operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The input and output sequences shall not overlap in any other way.
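+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.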
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading segmented scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing segmented scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_in_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_in` and in `d_out`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_in_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_in`.
+If `d_in_end_offsets[i] - 1 <= d_in_begin_offsets[i]`, the *i*<sup>th</sup>
+segment is considered empty.
+
+
+
+
+The number of segments that comprise the segmented prefix scan data.
+
+
+
+Binary associative scan functor
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+
+Computes a device-wide segmented inclusive prefix scan using the specified binary associative `scan_op` functor.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceSegmentedScan::InclusiveSegmentedScan(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ BeginOffsetIteratorInputT d_in_begin_offsets,
+ EndOffsetIteratorInputT d_in_end_offsets,
+ BeginOffsetIteratorOutputT d_out_begin_offsets,
+ ::cuda::std::int64_t num_segments,
+ ScanOpT scan_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+Supports non-commutative scan operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The input and output sequences shall not overlap in any other way.
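+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.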
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading segmented scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing segmented scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets in the output sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_in_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_in`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_in_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_in`.
+If `d_in_end_offsets[i] - 1 <= d_in_begin_offsets[i]`, the *i*<sup>th</sup>
+segment is considered empty.
+
+
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_out_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_out`
+
+
+
+
+The number of segments that comprise the segmented prefix scan data.
+
+
+
+Binary associative scan functor
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+
+### InclusiveSegmentedScanInit inline static
+
+
+
+
+Computes a device-wide segmented inclusive prefix scan using the specified binary associative `scan_op` functor. The result of applying the `scan_op` binary operator to `init_value` value and the first value in each input segment is assigned to the first value of the corresponding output segment.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceSegmentedScan::InclusiveSegmentedScanInit(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ BeginOffsetIteratorInputT d_in_begin_offsets,
+ EndOffsetIteratorInputT d_in_end_offsets,
+ ::cuda::std::int64_t num_segments,
+ ScanOpT scan_op,
+ InitValueT init_value,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+Supports non-commutative scan operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The input and output sequences shall not overlap in any other way.
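+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.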
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading segmented scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing segmented scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Type of the `init_value`
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_in_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_in` and in `d_out`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_in_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_in`.
+If `d_in_end_offsets[i] - 1 <= d_in_begin_offsets[i]`, the *i*<sup>th</sup>
+segment is considered empty.
+
+
+
+
+The number of segments that comprise the segmented prefix scan data.
+
+
+
+Binary associative scan functor
+
+
+
+Initial value to seed the inclusive scan for each segment in the output sequence
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+
+Computes a device-wide segmented inclusive prefix scan using the specified binary associative `scan_op` functor. The result of applying the `scan_op` binary operator to `init_value` value and the first value in each input segment is assigned to the first value of the corresponding output segment.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceSegmentedScan::InclusiveSegmentedScanInit(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ BeginOffsetIteratorInputT d_in_begin_offsets,
+ EndOffsetIteratorInputT d_in_end_offsets,
+ BeginOffsetIteratorOutputT d_out_begin_offsets,
+ ::cuda::std::int64_t num_segments,
+ ScanOpT scan_op,
+ InitValueT init_value,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+Supports non-commutative scan operators.
+Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run.
+When `d_in` and `d_out` are equal, the scan is performed in-place. The input and output sequences shall not overlap in any other way.
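+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.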
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading segmented scan inputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing segmented scan outputs (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets in the output sequence (may be a simple pointer type)
+
+
+
+**[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
+
+
+
+**[inferred]** Type of the `init_value`
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence of data items
+
+
+
+Random-access iterator to the output sequence of data items
+
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_in_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_in`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_in_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_in`.
+If `d_in_end_offsets[i] - 1 <= d_in_begin_offsets[i]`, the *i*<sup>th</sup>
+segment is considered empty.
+
+
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_out_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_out`
+
+
+
+
+The number of segments that comprise the segmented prefix scan data.
+
+
+
+Binary associative scan functor
+
+
+
+Initial value to seed the inclusive scan for each segment in the output sequence
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
diff --git a/fern/cudapages/cub/cub/cub/DeviceSegmentedSort.mdx b/fern/cudapages/cub/cub/cub/DeviceSegmentedSort.mdx
new file mode 100644
index 0000000..390dc4c
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceSegmentedSort.mdx
@@ -0,0 +1,2522 @@
+---
+title: cub::DeviceSegmentedSort
+description: ""
+---
+
+DeviceSegmentedSort provides device-wide, parallel operations for computing a batched sort across multiple, non-overlapping sequences of data items residing within device-accessible memory.
+
+## Example
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_segmented_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
+int *d_values_out; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedSort::SortPairs(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_keys_out, d_values_in, d_values_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedSort::SortPairs(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_keys_out, d_values_in, d_values_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
+// d_values_out <-- [1, 2, 0, 5, 4, 3, 6]
+```
+
+---
+
+## Keys-only
+
+### SortKeysDescendingNoNVTX inline static
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceSegmentedSort::SortKeysDescendingNoNVTX(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceSegmentedSort::SortKeysDescendingNoNVTX(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer &d_keys,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+
+
+### SortKeysNoNVTX inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceSegmentedSort::SortKeysNoNVTX(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer &d_keys,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+### SortPairsNoNVTX inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceSegmentedSort::SortPairsNoNVTX(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+### SortKeys inline static
+
+
+
+
+Sorts segments of keys into ascending order. Approximately `num_items + 2 * num_segments` auxiliary storage required.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceSegmentedSort::SortKeys(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The contents of the input data are not altered by the sorting operation.
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+SortKeys is not guaranteed to be stable. That is, suppose that `i` and `j` are equivalent: neither one is less than the other. It is not guaranteed that the relative order of these two elements will be preserved by sort.
+The range `[d_keys_out, d_keys_out + num_items)` shall not overlap `[d_keys_in, d_keys_in + num_items)`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not be accessed nor modified.
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Device-accessible pointer to the input data of key data to sort
+
+
+
+Device-accessible pointer to the sorted output sequence of key data
+
+
+
+The total number of items to sort (across all segments)
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> segment is considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible
+// pointers for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedSort::SortKeys(
+ d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedSort::SortKeys(
+ d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
+```
+
+
+
+
+Sorts segments of keys into ascending order. Approximately `2 * num_segments` auxiliary storage required.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceSegmentedSort::SortKeys(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer &d_keys,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The sorting operation is given a pair of key buffers managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+The contents of both buffers may be altered by the sorting operation.
+Upon completion, the sorting operation will update the "current" indicator within the DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits and the targeted device architecture).
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+SortKeys is not guaranteed to be stable. That is, suppose that `i` and `j` are equivalent: neither one is less than the other. It is not guaranteed that the relative order of these two elements will be preserved by sort.
+Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. The range `[cur, cur + num_items)` shall not overlap `[alt, alt + num_items)`. Both ranges shall not overlap `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys.Current()[i]`, `d_keys.Alternate()[i]` will not be accessed nor modified.
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+The total number of items to sort (across all segments)
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> segment is considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_segmented_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible
+// pointers for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Create a DoubleBuffer to wrap the pair of device pointers
+cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf);
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedSort::SortKeys(
+ d_temp_storage, temp_storage_bytes, d_keys,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedSort::SortKeys(
+ d_temp_storage, temp_storage_bytes, d_keys,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
+```
+
+
+
+
+### SortKeysDescending inline static
+
+
+
+
+Sorts segments of keys into descending order. Approximately `num_items + 2 * num_segments` auxiliary storage required.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceSegmentedSort::SortKeysDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The contents of the input data are not altered by the sorting operation.
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+SortKeysDescending is not guaranteed to be stable. That is, suppose that `i` and `j` are equivalent: neither one is less than the other. It is not guaranteed that the relative order of these two elements will be preserved by sort.
+The range `[d_keys_out, d_keys_out + num_items)` shall not overlap `[d_keys_in, d_keys_in + num_items)`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not be accessed nor modified.
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Device-accessible pointer to the input data of key data to sort
+
+
+
+Device-accessible pointer to the sorted output sequence of key data
+
+
+
+The total number of items to sort (across all segments)
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> segment is considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_segmented_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedSort::SortKeysDescending(
+ d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedSort::SortKeysDescending(
+ d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
+```
+
+
+
+
+Sorts segments of keys into descending order. Approximately `2 * num_segments` auxiliary storage required.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceSegmentedSort::SortKeysDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer &d_keys,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The sorting operation is given a pair of key buffers managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+The contents of both buffers may be altered by the sorting operation.
+Upon completion, the sorting operation will update the "current" indicator within the DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits and the targeted device architecture).
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+SortKeysDescending is not guaranteed to be stable. That is, suppose that `i` and `j` are equivalent: neither one is less than the other. It is not guaranteed that the relative order of these two elements will be preserved by sort.
+Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. The range `[cur, cur + num_items)` shall not overlap `[alt, alt + num_items)`. Both ranges shall not overlap `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys.Current()[i]`, `d_keys.Alternate()[i]` will not be accessed nor modified.
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+The total number of items to sort (across all segments)
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> segment is considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_segmented_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for
+// sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Create a DoubleBuffer to wrap the pair of device pointers
+cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedSort::SortKeysDescending(
+ d_temp_storage, temp_storage_bytes, d_keys,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedSort::SortKeysDescending(
+ d_temp_storage, temp_storage_bytes, d_keys,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
+```
+
+
+
+
+### StableSortKeys inline static
+
+
+
+
+Sorts segments of keys into ascending order. Approximately `num_items + 2 * num_segments` auxiliary storage required.
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedSort::StableSortKeys(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The contents of the input data are not altered by the sorting operation.
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+StableSortKeys is stable: it preserves the relative ordering of equivalent elements. That is, if `x` and `y` are elements such that `x` precedes `y`, and if the two elements are equivalent (neither `x < y` nor `y < x`) then a postcondition of stable sort is that `x` still precedes `y`.
+The range `[d_keys_out, d_keys_out + num_items)` shall not overlap `[d_keys_in, d_keys_in + num_items)`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not be accessed nor modified.
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Device-accessible pointer to the input data of key data to sort
+
+
+
+Device-accessible pointer to the sorted output sequence of key data
+
+
+
+The total number of items to sort (across all segments)
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the `i`-th segment is considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_segmented_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedSort::StableSortKeys(
+ d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedSort::StableSortKeys(
+ d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
+```
+
+
+
+
+Sorts segments of keys into ascending order. Approximately `2 * num_segments` auxiliary storage required.
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedSort::StableSortKeys(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The sorting operation is given a pair of key buffers managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+The contents of both buffers may be altered by the sorting operation.
+Upon completion, the sorting operation will update the "current" indicator within the DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits and the targeted device architecture).
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+StableSortKeys is stable: it preserves the relative ordering of equivalent elements. That is, if `x` and `y` are elements such that `x` precedes `y`, and if the two elements are equivalent (neither `x < y` nor `y < x`) then a postcondition of stable sort is that `x` still precedes `y`.
+Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. The range `[cur, cur + num_items)` shall not overlap `[alt, alt + num_items)`. Both ranges shall not overlap `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys.Current()[i]`, `d_keys.Alternate()[i]` will not be accessed nor modified.
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+The total number of items to sort (across all segments)
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the `i`-th segment is
+considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_segmented_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Create a DoubleBuffer to wrap the pair of device pointers
+cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedSort::StableSortKeys(
+ d_temp_storage, temp_storage_bytes, d_keys,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedSort::StableSortKeys(
+ d_temp_storage, temp_storage_bytes, d_keys,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
+```
+
+
+
+
+### StableSortKeysDescending inline static
+
+
+
+
+Sorts segments of keys into descending order. Approximately `num_items + 2 * num_segments` auxiliary storage required.
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedSort::StableSortKeysDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The contents of the input data are not altered by the sorting operation.
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+StableSortKeysDescending is stable: it preserves the relative ordering of equivalent elements. That is, if `x` and `y` are elements such that `x` precedes `y`, and if the two elements are equivalent (neither `x < y` nor `y < x`) then a postcondition of stable sort is that `x` still precedes `y`.
+The range `[d_keys_out, d_keys_out + num_items)` shall not overlap `[d_keys_in, d_keys_in + num_items)`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not be accessed nor modified.
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Device-accessible pointer to the input data of key data to sort
+
+
+
+Device-accessible pointer to the sorted output sequence of key data
+
+
+
+The total number of items to sort (across all segments)
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_keys_*` and
+`d_values_*`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the `i`-th segment is
+considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedSort::StableSortKeysDescending(
+ d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedSort::StableSortKeysDescending(
+ d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
+```
+
+
+
+
+Sorts segments of keys into descending order. Approximately `2 * num_segments` auxiliary storage required.
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedSort::StableSortKeysDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The sorting operation is given a pair of key buffers managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+The contents of both buffers may be altered by the sorting operation.
+Upon completion, the sorting operation will update the "current" indicator within the DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits and the targeted device architecture).
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+StableSortKeysDescending is stable: it preserves the relative ordering of equivalent elements. That is, if `x` and `y` are elements such that `x` precedes `y`, and if the two elements are equivalent (neither `x < y` nor `y < x`) then a postcondition of stable sort is that `x` still precedes `y`.
+Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. The range `[cur, cur + num_items)` shall not overlap `[alt, alt + num_items)`. Both ranges shall not overlap `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys.Current()[i]`, `d_keys.Alternate()[i]` will not be accessed nor modified.
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+The total number of items to sort (across all segments)
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last
+element of the *i*<sup>th</sup> data segment in `d_keys_*` and
+`d_values_*`. If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the
+`i`-th segment is considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Create a DoubleBuffer to wrap the pair of device pointers
+cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedSort::StableSortKeysDescending(
+ d_temp_storage, temp_storage_bytes, d_keys,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedSort::StableSortKeysDescending(
+ d_temp_storage, temp_storage_bytes, d_keys,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
+```
+
+
+
+
+---
+
+## Key-value pairs
+
+### SortPairsDescendingNoNVTX inline static
+
+
+
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedSort::SortPairsDescendingNoNVTX(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
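+template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedSort::SortPairsDescendingNoNVTX(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,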
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+
+
+
+### SortPairsNoNVTX inline static
+
+
+```cpp showLineNumbers={false}
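+template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedSort::SortPairsNoNVTX(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,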
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+### SortPairs inline static
+
+
+
+
+Sorts segments of key-value pairs into ascending order. Approximately `2 * num_items + 2 * num_segments` auxiliary storage required.
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedSort::SortPairs(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The contents of the input data are not altered by the sorting operation.
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+SortPairs is not guaranteed to be stable. That is, suppose that `i` and `j` are equivalent: neither one is less than the other. It is not guaranteed that the relative order of these two elements will be preserved by sort.
+Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall not overlap `[in, in + num_items)`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified.
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Value type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Device-accessible pointer to the input data of key data to sort
+
+
+
+Device-accessible pointer to the sorted output sequence of key data
+
+
+
+Device-accessible pointer to the corresponding input sequence of associated value items
+
+
+
+Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
+
+
+
+The total number of items to sort (across all segments)
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the `i`-th segment is
+considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys with associated vector of `int` values.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_segmented_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
+int *d_values_out; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedSort::SortPairs(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_keys_out, d_values_in, d_values_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedSort::SortPairs(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_keys_out, d_values_in, d_values_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
+// d_values_out <-- [1, 2, 0, 5, 4, 3, 6]
+```
+
+
+
+
+Sorts segments of key-value pairs into ascending order. Approximately `2 * num_segments` auxiliary storage required.
+
+
+```cpp showLineNumbers={false}
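+template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedSort::SortPairs(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,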
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The sorting operation is given a pair of key buffers and a corresponding pair of associated value buffers. Each pair is managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+The contents of both buffers within each pair may be altered by the sorting operation.
+Upon completion, the sorting operation will update the "current" indicator within each DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+SortPairs is not guaranteed to be stable. That is, suppose that `i` and `j` are equivalent: neither one is less than the other. It is not guaranteed that the relative order of these two elements will be preserved by sort.
+Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range `[cur, cur + num_items)` shall not overlap `[alt, alt + num_items)`. Both ranges shall not overlap `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys.Current()[i]`, `d_values.Current()[i]`, `d_keys.Alternate()[i]`, `d_values.Alternate()[i]` will not be accessed nor modified.
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Value type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+
+
+
+The total number of items to sort (across all segments)
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the `i`-th segment is
+considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys with associated vector of `int` values.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_segmented_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
+int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
+int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Create a set of DoubleBuffers to wrap pairs of device pointers
+cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedSort::SortPairs(
+ d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedSort::SortPairs(
+ d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
+// d_values.Current() <-- [1, 2, 0, 5, 4, 3, 6]
+```
+
+
+
+
+### SortPairsDescending inline static
+
+
+
+
+Sorts segments of key-value pairs into descending order. Approximately `2 * num_items + 2 * num_segments` auxiliary storage required.
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedSort::SortPairsDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The contents of the input data are not altered by the sorting operation.
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+SortPairsDescending is not guaranteed to be stable. That is, suppose that `i` and `j` are equivalent: neither one is less than the other. It is not guaranteed that the relative order of these two elements will be preserved by sort.
+Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall not overlap `[in, in + num_items)`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified.
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Value type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Device-accessible pointer to the input sequence of key data to sort
+
+
+
+Device-accessible pointer to the sorted output sequence of key data
+
+
+
+Device-accessible pointer to the corresponding input sequence of associated value items
+
+
+
+Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
+
+
+
+The total number of items to sort (across all segments)
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the `i`-th segment is
+considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys with associated vector of `int` values.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_segmented_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for
+// sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
+int *d_values_out; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedSort::SortPairsDescending(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_keys_out, d_values_in, d_values_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedSort::SortPairsDescending(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_keys_out, d_values_in, d_values_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
+// d_values_out <-- [0, 2, 1, 6, 3, 4, 5]
+```
+
+
+
+
+Sorts segments of key-value pairs into descending order. Approximately `2 * num_segments` auxiliary storage required.
+
+
+```cpp showLineNumbers={false}
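+template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedSort::SortPairsDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,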
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The sorting operation is given a pair of key buffers and a corresponding pair of associated value buffers. Each pair is managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+The contents of both buffers within each pair may be altered by the sorting operation.
+Upon completion, the sorting operation will update the "current" indicator within each DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+SortPairsDescending is not guaranteed to be stable. That is, suppose that `i` and `j` are equivalent: neither one is less than the other. It is not guaranteed that the relative order of these two elements will be preserved by sort.
+Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range `[cur, cur + num_items)` shall not overlap `[alt, alt + num_items)`. Both ranges shall not overlap `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys.Current()[i]`, `d_values.Current()[i]`, `d_keys.Alternate()[i]`, `d_values.Alternate()[i]` will not be accessed nor modified.
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Value type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+
+
+
+The total number of items to sort (across all segments)
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the `i`-th segment is
+considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys with associated vector of `int` values.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for
+// sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
+int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
+int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Create a set of DoubleBuffers to wrap pairs of device pointers
+cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedSort::SortPairsDescending(
+ d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedSort::SortPairsDescending(
+ d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
+// d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5]
+```
+
+
+
+
+### StableSortPairs inline static
+
+
+
+
+Sorts segments of key-value pairs into ascending order. Approximately `2 * num_items + 2 * num_segments` auxiliary storage required.
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedSort::StableSortPairs(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The contents of the input data are not altered by the sorting operation.
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+StableSortPairs is stable: it preserves the relative ordering of equivalent elements. That is, if `x` and `y` are elements such that `x` precedes `y`, and if the two elements are equivalent (neither `x < y` nor `y < x`) then a postcondition of stable sort is that `x` still precedes `y`.
+Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall not overlap `[in, in + num_items)`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified.
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Value type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Device-accessible pointer to the input sequence of key data to sort
+
+
+
+Device-accessible pointer to the sorted output sequence of key data
+
+
+
+Device-accessible pointer to the corresponding input sequence of associated value items
+
+
+
+Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
+
+
+
+The total number of items to sort (across all segments)
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the `i`-th segment is
+considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys with associated vector of `int` values.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_segmented_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
+int *d_values_out; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedSort::StableSortPairs(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_keys_out, d_values_in, d_values_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedSort::StableSortPairs(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_keys_out, d_values_in, d_values_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
+// d_values_out <-- [1, 2, 0, 5, 4, 3, 6]
+```
+
+
+
+
+Sorts segments of key-value pairs into ascending order. Approximately `2 * num_segments` auxiliary storage required.
+
+
+```cpp showLineNumbers={false}
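+template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedSort::StableSortPairs(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,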
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The sorting operation is given a pair of key buffers and a corresponding pair of associated value buffers. Each pair is managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+The contents of both buffers within each pair may be altered by the sorting operation.
+Upon completion, the sorting operation will update the "current" indicator within each DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+StableSortPairs is stable: it preserves the relative ordering of equivalent elements. That is, if `x` and `y` are elements such that `x` precedes `y`, and if the two elements are equivalent (neither `x < y` nor `y < x`) then a postcondition of stable sort is that `x` still precedes `y`.
+Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range `[cur, cur + num_items)` shall not overlap `[alt, alt + num_items)`. Both ranges shall not overlap `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys.Current()[i]`, `d_values.Current()[i]`, `d_keys.Alternate()[i]`, `d_values.Alternate()[i]` will not be accessed nor modified.
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Value type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+
+
+
+The total number of items to sort (across all segments)
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the `i`-th segment is
+considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys with associated vector of `int` values.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+// or equivalently <cub/device/device_segmented_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
+int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
+int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Create a set of DoubleBuffers to wrap pairs of device pointers
+cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedSort::StableSortPairs(
+ d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedSort::StableSortPairs(
+ d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
+// d_values.Current() <-- [1, 2, 0, 5, 4, 3, 6]
+```
+
+
+
+
+### StableSortPairsDescending inline static
+
+
+
+
+Sorts segments of key-value pairs into descending order. Approximately `2 * num_items + 2 * num_segments` auxiliary storage required.
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedSort::StableSortPairsDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The contents of the input data are not altered by the sorting operation.
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+StableSortPairsDescending is stable: it preserves the relative ordering of equivalent elements. That is, if `x` and `y` are elements such that `x` precedes `y`, and if the two elements are equivalent (neither `x < y` nor `y < x`) then a postcondition of stable sort is that `x` still precedes `y`.
+Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall not overlap `[in, in + num_items)`, `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified.
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Value type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Device-accessible pointer to the input sequence of key data to sort
+
+
+
+Device-accessible pointer to the sorted output sequence of key data
+
+
+
+Device-accessible pointer to the corresponding input sequence of associated value items
+
+
+
+Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
+
+
+
+The total number of items to sort (across all segments)
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the `i`-th segment is
+considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys with associated vector of `int` values.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
+int *d_values_out; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedSort::StableSortPairsDescending(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_keys_out, d_values_in, d_values_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedSort::StableSortPairsDescending(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_keys_out, d_values_in, d_values_out,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
+// d_values_out <-- [0, 2, 1, 6, 3, 4, 5]
+```
+
+
+
+
+Sorts segments of key-value pairs into descending order. Approximately `2 * num_segments` auxiliary storage required.
+
+
+```cpp showLineNumbers={false}
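+template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedSort::StableSortPairsDescending(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,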
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The sorting operation is given a pair of key buffers and a corresponding pair of associated value buffers. Each pair is managed by a DoubleBuffer structure that indicates which of the two buffers is "current" (and thus contains the input data to be sorted).
+The contents of both buffers within each pair may be altered by the sorting operation.
+Upon completion, the sorting operation will update the "current" indicator within each DoubleBuffer wrapper to reference which of the two buffers now contains the sorted output sequence (a function of the number of key bits specified and the targeted device architecture).
+When the input is a contiguous sequence of segments, a single sequence `segment_offsets` (of length `num_segments + 1`) can be aliased for both the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is specified as `segment_offsets + 1`).
+StableSortPairsDescending is stable: it preserves the relative ordering of equivalent elements. That is, if `x` and `y` are elements such that `x` precedes `y`, and if the two elements are equivalent (neither `x < y` nor `y < x`) then a postcondition of stable sort is that `x` still precedes `y`.
+Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range `[cur, cur + num_items)` shall not overlap `[alt, alt + num_items)`. Both ranges shall not overlap `[d_begin_offsets, d_begin_offsets + num_segments)` nor `[d_end_offsets, d_end_offsets + num_segments)` in any way.
+Segments are not required to be contiguous. For all index values `i` outside the specified segments `d_keys.Current()[i]`, `d_values.Current()[i]`, `d_keys.Alternate()[i]`, `d_values.Alternate()[i]` will not be accessed nor modified.
+
+
+**Template parameters**
+
+
+**[inferred]** Key type
+
+
+
+**[inferred]** Value type
+
+
+
+**[inferred]** Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+
+
+
+The total number of items to sort (across all segments)
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of
+length `num_segments`, such that `d_begin_offsets[i]` is the first
+element of the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+
+
+
+
+
+Random-access input iterator to the sequence of ending offsets of length
+`num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the `i`-th segment is
+considered empty.
+
+
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of `int` keys with associated vector of `int` values.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for sorting data
+int num_items; // e.g., 7
+int num_segments; // e.g., 3
+int *d_offsets; // e.g., [0, 3, 3, 7]
+int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
+int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
+int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
+...
+
+// Create a set of DoubleBuffers to wrap pairs of device pointers
+cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSegmentedSort::StableSortPairsDescending(
+ d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run sorting operation
+cub::DeviceSegmentedSort::StableSortPairsDescending(
+ d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ num_items, num_segments, d_offsets, d_offsets + 1);
+
+// d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
+// d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5]
+```
+
+
+
+
+---
+
+## Utility methods
+
+### GetName inline static constexpr
+
+
+```cpp showLineNumbers={false}
+static constexpr const char * cub::DeviceSegmentedSort::GetName()
+```
+
+
+### SortKeysNoNVTX inline static
+
+
+```cpp showLineNumbers={false}
+template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+static cudaError_t cub::DeviceSegmentedSort::SortKeysNoNVTX(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ cudaStream_t stream = 0
+)
+```
+
diff --git a/fern/cudapages/cub/cub/cub/DeviceSelect.mdx b/fern/cudapages/cub/cub/cub/DeviceSelect.mdx
new file mode 100644
index 0000000..89581c6
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceSelect.mdx
@@ -0,0 +1,1326 @@
+---
+title: cub::DeviceSelect
+description: ""
+---
+
+DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. It is similar to DevicePartition, except that non-selected items are discarded, whereas DevicePartition retains them.
+
+## Performance considerations
+
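+The work-complexity of select-flagged, select-if, and select-unique as a function of input size is linear, resulting in performance throughput that plateaus with problem sizes large enough to saturate the GPU.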
+
+---
+
+## Methods
+
+### select_impl inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceSelect::select_impl(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ FlagIteratorT d_flags,
+ OutputIteratorT d_out,
+ NumSelectedIteratorT d_num_selected_out,
+ OffsetT num_items,
+ SelectOpT select_op,
+ EqualityOpT equality_op,
+ cudaStream_t stream
+)
+```
+
+
+---
+
+## Static methods
+
+### Flagged inline static
+
+
+
+
+Uses the `d_flags` sequence to selectively copy the corresponding items from `d_in` into `d_out`. The total number of items selected is written to `d_num_selected_out`.
+
+
+```cpp showLineNumbers={false}
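+template <typename InputIteratorT, typename FlagIterator, typename OutputIteratorT, typename NumSelectedIteratorT>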
+static cudaError_t cub::DeviceSelect::Flagged(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ FlagIterator d_flags,
+ OutputIteratorT d_out,
+ NumSelectedIteratorT d_num_selected_out,
+ ::cuda::std::int64_t num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The value type of `d_flags` must be castable to `bool` (e.g., `bool`, `char`, `int`, etc.).
+Copies of the selected items are compacted into `d_out` and maintain their original relative ordering.
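+The range `[d_out, d_out + *d_num_selected_out)` shall not overlap `[d_in, d_in + num_items)`, `[d_flags, d_flags + num_items)` nor `d_num_selected_out` in any way.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.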
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading selection flags (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing selected items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of items selected (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the input sequence of selection flags
+
+
+
+Pointer to the output sequence of selected data items
+
+
+
+Pointer to the output total number of items selected (i.e., length of `d_out`)
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the compaction of items selected from an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_select.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for input,
+// flags, and output
+int num_items; // e.g., 8
+int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
+int *d_out; // e.g., [ , , , , , , , ]
+int *d_num_selected_out; // e.g., [ ]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSelect::Flagged(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_flags, d_out, d_num_selected_out, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run selection
+cub::DeviceSelect::Flagged(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_flags, d_out, d_num_selected_out, num_items);
+
+// d_out <-- [1, 4, 6, 7]
+// d_num_selected_out <-- [4]
+```
+
+
+
+
+nodiscard
+
+Uses the `d_flags` sequence to selectively copy the corresponding items from `d_in` into `d_out`. The total number of items selected is written to `d_num_selected_out`.
+
+This is an environment-based API that allows customization of:
+
+
+```cpp showLineNumbers={false}
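+template <typename InputIteratorT, typename FlagIterator, typename OutputIteratorT, typename NumSelectedIteratorT, typename NumItemsT, typename EnvT = ::cuda::std::execution::env<>>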
+static cudaError_t cub::DeviceSelect::Flagged(
+ InputIteratorT d_in,
+ FlagIterator d_flags,
+ OutputIteratorT d_out,
+ NumSelectedIteratorT d_num_selected_out,
+ NumItemsT num_items,
+ EnvT env = {}
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Stream: Query via `cuda::get_stream`
+Memory resource: Query via `cuda::mr::get_memory_resource`
+The value type of `d_flags` must be castable to `bool` (e.g., `bool`, `char`, `int`, etc.).
+Copies of the selected items are compacted into `d_out` and maintain their original relative ordering.
+The range `[d_out, d_out + *d_num_selected_out)` shall not overlap `[d_in, d_in + num_items)`, `[d_flags, d_flags + num_items)` nor `d_num_selected_out` in any way.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading selection flags (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing selected items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of items selected (may be a simple pointer type)
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
+
+
+**Parameters**
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the input sequence of selection flags
+
+
+
+Pointer to the output sequence of selected data items
+
+
+
+Pointer to the output total number of items selected (i.e., length of `d_out`)
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
+
+
+Uses the `d_flags` sequence to selectively compact the items in `d_data`. The total number of items selected is written to `d_num_selected_out`.
+
+
+```cpp showLineNumbers={false}
+template <typename IteratorT, typename FlagIterator, typename NumSelectedIteratorT>
+static cudaError_t cub::DeviceSelect::Flagged(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ IteratorT d_data,
+ FlagIterator d_flags,
+ NumSelectedIteratorT d_num_selected_out,
+ ::cuda::std::int64_t num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The value type of `d_flags` must be castable to `bool` (e.g., `bool`, `char`, `int`, etc.).
+Copies of the selected items are compacted in-place and maintain their original relative ordering.
+`d_data` may equal `d_flags`. The range `[d_data, d_data + num_items)` shall not overlap `[d_flags, d_flags + num_items)` in any other way.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access iterator type for reading and writing selected items (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading selection flags (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of items selected (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the sequence of data items
+
+
+
+Pointer to the input sequence of selection flags
+
+
+
+Pointer to the output total number of items selected
+
+
+
+Total number of input items (i.e., length of `d_data`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the compaction of items selected from an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_select.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers for input,
+// flags, and output
+int num_items; // e.g., 8
+int *d_data; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
+int *d_num_selected_out; // e.g., [ ]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSelect::Flagged(
+ d_temp_storage, temp_storage_bytes,
+ d_data, d_flags, d_num_selected_out, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run selection
+cub::DeviceSelect::Flagged(
+ d_temp_storage, temp_storage_bytes,
+ d_data, d_flags, d_num_selected_out, num_items);
+
+// d_data <-- [1, 4, 6, 7]
+// d_num_selected_out <-- [4]
+```
+
+
+
+
+### If inline static
+
+
+
+
+nodiscard
+
+Uses the `select_op` functor to selectively copy items from `d_in` into `d_out`. The total number of items selected is written to `d_num_selected_out`.
+
+This is an environment-based API that allows customization of:
+
+
+```cpp showLineNumbers={false}
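+template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename SelectOp, typename NumItemsT, typename EnvT = ::cuda::std::execution::env<>>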
+static cudaError_t cub::DeviceSelect::If(
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumSelectedIteratorT d_num_selected_out,
+ NumItemsT num_items,
+ SelectOp select_op,
+ EnvT env = {}
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Stream: Query via `cuda::get_stream`
+Memory resource: Query via `cuda::mr::get_memory_resource`
+Copies of the selected items are compacted into `d_out` and maintain their original relative ordering.
+The range `[d_out, d_out + *d_num_selected_out)` shall not overlap `[d_in, d_in + num_items)` nor `d_num_selected_out` in any way.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing selected items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of items selected (may be a simple pointer type)
+
+
+
+**[inferred]** Selection operator type having member `bool operator()(const T &a)`
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
+
+
+**Parameters**
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output sequence of selected data items
+
+
+
+Pointer to the output total number of items selected (i.e., length of `d_out`)
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+Unary selection operator
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
+
+
+Uses the `select_op` functor to selectively copy items from `d_in` into `d_out`. The total number of items selected is written to `d_num_selected_out`.
+
+
+```cpp showLineNumbers={false}
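+template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename SelectOp>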
+static cudaError_t cub::DeviceSelect::If(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumSelectedIteratorT d_num_selected_out,
+ ::cuda::std::int64_t num_items,
+ SelectOp select_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+Copies of the selected items are compacted into `d_out` and maintain their original relative ordering.
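+The range `[d_out, d_out + *d_num_selected_out)` shall not overlap `[d_in, d_in + num_items)` nor `d_num_selected_out` in any way.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.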
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing selected items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of items selected (may be a simple pointer type)
+
+
+
+**[inferred]** Selection operator type having member `bool operator()(const T &a)`
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output sequence of selected data items
+
+
+
+Pointer to the output total number of items selected (i.e., length of `d_out`)
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+Unary selection operator
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the compaction of items selected from an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_select.cuh>
+
+// Functor type for selecting values less than some criteria
+struct LessThan
+{
+ int compare;
+
+ __host__ __device__ __forceinline__
+ LessThan(int compare) : compare(compare) {}
+
+ __host__ __device__ __forceinline__
+ bool operator()(const int &a) const {
+ return (a < compare);
+ }
+};
+
+// Declare, allocate, and initialize device-accessible pointers
+// for input and output
+int num_items; // e.g., 8
+int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
+int *d_out; // e.g., [ , , , , , , , ]
+int *d_num_selected_out; // e.g., [ ]
+LessThan select_op(7);
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSelect::If(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, d_num_selected_out, num_items, select_op);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run selection
+cub::DeviceSelect::If(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, d_num_selected_out, num_items, select_op);
+
+// d_out <-- [0, 2, 3, 5, 2]
+// d_num_selected_out <-- [5]
+```
+
+
+
+
+Uses the `select_op` functor to selectively compact items in `d_data`. The total number of items selected is written to `d_num_selected_out`.
+
+
+```cpp showLineNumbers={false}
+template <typename IteratorT, typename NumSelectedIteratorT, typename SelectOp>
+static cudaError_t cub::DeviceSelect::If(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ IteratorT d_data,
+ NumSelectedIteratorT d_num_selected_out,
+ ::cuda::std::int64_t num_items,
+ SelectOp select_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
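+Copies of the selected items are compacted in `d_data` and maintain their original relative ordering.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.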
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading and writing items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of items selected (may be a simple pointer type)
+
+
+
+**[inferred]** Selection operator type having member `bool operator()(const T &a)`
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the sequence of data items
+
+
+
+Pointer to the output total number of items selected
+
+
+
+Total number of input items (i.e., length of `d_data`)
+
+
+
+Unary selection operator
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the compaction of items selected from an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_select.cuh>
+
+// Functor type for selecting values less than some criteria
+struct LessThan
+{
+ int compare;
+
+ __host__ __device__ __forceinline__
+ LessThan(int compare) : compare(compare) {}
+
+ __host__ __device__ __forceinline__
+ bool operator()(const int &a) const {
+ return (a < compare);
+ }
+};
+
+// Declare, allocate, and initialize device-accessible pointers
+// for input and output
+int num_items; // e.g., 8
+int *d_data; // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
+int *d_num_selected_out; // e.g., [ ]
+LessThan select_op(7);
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSelect::If(
+ d_temp_storage, temp_storage_bytes,
+ d_data, d_num_selected_out, num_items, select_op);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run selection
+cub::DeviceSelect::If(
+ d_temp_storage, temp_storage_bytes,
+ d_data, d_num_selected_out, num_items, select_op);
+
+// d_data <-- [0, 2, 3, 5, 2]
+// d_num_selected_out <-- [5]
+```
+
+
+
+
+### FlaggedIf inline static
+
+
+
+
+Uses the `select_op` functor applied to `d_flags` to selectively copy the corresponding items from `d_in` into `d_out`. The total number of items selected is written to `d_num_selected_out`.
+
+
+```cpp showLineNumbers={false}
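+template <typename InputIteratorT, typename FlagIterator, typename OutputIteratorT, typename NumSelectedIteratorT, typename SelectOp>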
+static cudaError_t cub::DeviceSelect::FlaggedIf(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ FlagIterator d_flags,
+ OutputIteratorT d_out,
+ NumSelectedIteratorT d_num_selected_out,
+ ::cuda::std::int64_t num_items,
+ SelectOp select_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The expression `select_op(flag)` must be convertible to `bool`, where the type of `flag` corresponds to the value type of `FlagIterator`.
+Copies of the selected items are compacted into `d_out` and maintain their original relative ordering.
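+The range `[d_out, d_out + *d_num_selected_out)` shall not overlap `[d_in, d_in + num_items)` nor `d_num_selected_out` in any way.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.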
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading selection flags (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing selected items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of items selected (may be a simple pointer type)
+
+
+
+**[inferred]** Selection operator type having member `bool operator()(const T &a)`
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the input sequence of selection flags
+
+
+
+Pointer to the output sequence of selected data items
+
+
+
+Pointer to the output total number of items selected (i.e., length of `d_out`)
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+Unary selection operator
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+
+
+Uses the `select_op` functor applied to `d_flags` to selectively compact the corresponding items in `d_data`. The total number of items selected is written to `d_num_selected_out`.
+
+
+```cpp showLineNumbers={false}
+template <typename IteratorT, typename FlagIterator, typename NumSelectedIteratorT, typename SelectOp>
+static cudaError_t cub::DeviceSelect::FlaggedIf(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ IteratorT d_data,
+ FlagIterator d_flags,
+ NumSelectedIteratorT d_num_selected_out,
+ ::cuda::std::int64_t num_items,
+ SelectOp select_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The expression `select_op(flag)` must be convertible to `bool`, where the type of `flag` corresponds to the value type of `FlagIterator`.
+Copies of the selected items are compacted in-place and maintain their original relative ordering.
+`d_data` may equal `d_flags`. The range `[d_data, d_data + num_items)` shall not overlap `[d_flags, d_flags + num_items)` in any other way.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access iterator type for reading and writing selected items (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading selection flags (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of items selected (may be a simple pointer type)
+
+
+
+**[inferred]** Selection operator type having member `bool operator()(const T &a)`
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the sequence of data items
+
+
+
+Pointer to the input sequence of selection flags
+
+
+
+Pointer to the output total number of items selected
+
+
+
+Total number of input items (i.e., length of `d_data`)
+
+
+
+Unary selection operator
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+
+
+### Unique inline static
+
+Given an input sequence `d_in` having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to `d_out`. The total number of items selected is written to `d_num_selected_out`.
+
+
+```cpp showLineNumbers={false}
+template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT>
+static cudaError_t cub::DeviceSelect::Unique(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ NumSelectedIteratorT d_num_selected_out,
+ ::cuda::std::int64_t num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The `==` equality operator is used to determine whether keys are equivalent
+Copies of the selected items are compacted into `d_out` and maintain their original relative ordering.
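+The range `[d_out, d_out + *d_num_selected_out)` shall not overlap `[d_in, d_in + num_items)` nor `d_num_selected_out` in any way.
+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.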
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing selected items (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of items selected (may be a simple pointer type)
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output sequence of selected data items
+
+
+
+Pointer to the output total number of items selected (i.e., length of `d_out`)
+
+
+
+Total number of input items (i.e., length of `d_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the compaction of items selected from an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_select.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for input and output
+int num_items; // e.g., 8
+int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+int *d_out; // e.g., [ , , , , , , , ]
+int *d_num_selected_out; // e.g., [ ]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSelect::Unique(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, d_num_selected_out, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run selection
+cub::DeviceSelect::Unique(
+ d_temp_storage, temp_storage_bytes,
+ d_in, d_out, d_num_selected_out, num_items);
+
+// d_out <-- [0, 2, 9, 5, 8]
+// d_num_selected_out <-- [5]
+```
+
+### UniqueByKey inline static
+
+
+
+
+Given an input sequence `d_keys_in` and `d_values_in` with runs of key-value pairs with consecutive equal-valued keys, only the first key and its value from each run is selectively copied to `d_keys_out` and `d_values_out`. The total number of items selected is written to `d_num_selected_out`.
+
+
+```cpp showLineNumbers={false}
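+template <typename KeyInputIteratorT, typename ValueInputIteratorT, typename KeyOutputIteratorT, typename ValueOutputIteratorT, typename NumSelectedIteratorT, typename NumItemsT, typename EqualityOpT>
+static ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<EqualityOpT, cudaStream_t>, cudaError_t> cub::DeviceSelect::UniqueByKey(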
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyInputIteratorT d_keys_in,
+ ValueInputIteratorT d_values_in,
+ KeyOutputIteratorT d_keys_out,
+ ValueOutputIteratorT d_values_out,
+ NumSelectedIteratorT d_num_selected_out,
+ NumItemsT num_items,
+ EqualityOpT equality_op,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The user-provided equality operator, `equality_op`, is used to determine whether keys are equivalent
+Copies of the selected items are compacted into `d_keys_out` and `d_values_out` and maintain their original relative ordering.
+In-place operations are not supported. There must be no overlap between any of the provided ranges:
+`[d_keys_in, d_keys_in + num_items)`
+`[d_keys_out, d_keys_out + *d_num_selected_out)`
+`[d_values_in, d_values_in + num_items)`
+`[d_values_out, d_values_out + *d_num_selected_out)`
+`[d_num_selected_out, d_num_selected_out + 1)`
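+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.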
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input keys (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading input values (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing selected keys (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing selected values (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of items selected (may be a simple pointer type)
+
+
+
+**[inferred]** Type of num_items
+
+
+
+**[inferred]** Type of equality_op
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of keys
+
+
+
+Pointer to the input sequence of values
+
+
+
+Pointer to the output sequence of selected keys
+
+
+
+Pointer to the output sequence of selected values
+
+
+
+Pointer to the total number of items selected (i.e., length of `d_keys_out` or `d_values_out`)
+
+
+
+Total number of input items (i.e., length of `d_keys_in` or `d_values_in`)
+
+
+
+Binary predicate to determine equality
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the compaction of items selected from an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_select.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for input and output
+int num_items; // e.g., 8
+int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+int *d_keys_out; // e.g., [ , , , , , , , ]
+int *d_values_out; // e.g., [ , , , , , , , ]
+int *d_num_selected_out; // e.g., [ ]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSelect::UniqueByKey(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_values_in,
+ d_keys_out, d_values_out, d_num_selected_out, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run selection
+cub::DeviceSelect::UniqueByKey(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_values_in,
+ d_keys_out, d_values_out, d_num_selected_out, num_items);
+
+// d_keys_out <-- [0, 2, 9, 5, 8]
+// d_values_out <-- [1, 2, 4, 5, 8]
+// d_num_selected_out <-- [5]
+```
+
+
+
+
+Given an input sequence `d_keys_in` and `d_values_in` with runs of key-value pairs with consecutive equal-valued keys, only the first key and its value from each run is selectively copied to `d_keys_out` and `d_values_out`. The total number of items selected is written to `d_num_selected_out`.
+
+
+```cpp showLineNumbers={false}
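+template <typename KeyInputIteratorT, typename ValueInputIteratorT, typename KeyOutputIteratorT, typename ValueOutputIteratorT, typename NumSelectedIteratorT, typename NumItemsT>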
+static cudaError_t cub::DeviceSelect::UniqueByKey(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyInputIteratorT d_keys_in,
+ ValueInputIteratorT d_values_in,
+ KeyOutputIteratorT d_keys_out,
+ ValueOutputIteratorT d_values_out,
+ NumSelectedIteratorT d_num_selected_out,
+ NumItemsT num_items,
+ cudaStream_t stream = 0
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The `==` equality operator is used to determine whether keys are equivalent
+Copies of the selected items are compacted into `d_keys_out` and `d_values_out` and maintain their original relative ordering.
+In-place operations are not supported. There must be no overlap between any of the provided ranges:
+`[d_keys_in, d_keys_in + num_items)`
+`[d_keys_out, d_keys_out + *d_num_selected_out)`
+`[d_values_in, d_values_in + num_items)`
+`[d_values_out, d_values_out + *d_num_selected_out)`
+`[d_num_selected_out, d_num_selected_out + 1)`
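+When `d_temp_storage` is `nullptr`, no work is done and the required allocation size is returned in `temp_storage_bytes`.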
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input keys (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading input values (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing selected keys (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing selected values (may be a simple pointer type)
+
+
+
+**[inferred]** Output iterator type for recording the number of items selected (may be a simple pointer type)
+
+
+
+**[inferred]** Type of num_items
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Pointer to the input sequence of keys
+
+
+
+Pointer to the input sequence of values
+
+
+
+Pointer to the output sequence of selected keys
+
+
+
+Pointer to the output sequence of selected values
+
+
+
+Pointer to the total number of items selected (i.e., length of `d_keys_out` or `d_values_out`)
+
+
+
+Total number of input items (i.e., length of `d_keys_in` or `d_values_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+**Example**
+
+The code snippet below illustrates the compaction of items selected from an `int` device vector.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/device/device_select.cuh>
+
+// Declare, allocate, and initialize device-accessible pointers
+// for input and output
+int num_items; // e.g., 8
+int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+int *d_keys_out; // e.g., [ , , , , , , , ]
+int *d_values_out; // e.g., [ , , , , , , , ]
+int *d_num_selected_out; // e.g., [ ]
+...
+
+// Determine temporary device storage requirements
+void *d_temp_storage = nullptr;
+size_t temp_storage_bytes = 0;
+cub::DeviceSelect::UniqueByKey(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_values_in,
+ d_keys_out, d_values_out, d_num_selected_out, num_items);
+
+// Allocate temporary storage
+cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+// Run selection
+cub::DeviceSelect::UniqueByKey(
+ d_temp_storage, temp_storage_bytes,
+ d_keys_in, d_values_in,
+ d_keys_out, d_values_out, d_num_selected_out, num_items);
+
+// d_keys_out <-- [0, 2, 9, 5, 8]
+// d_values_out <-- [1, 2, 4, 5, 8]
+// d_num_selected_out <-- [5]
+```
+
+
+
diff --git a/fern/cudapages/cub/cub/cub/DeviceTopK.mdx b/fern/cudapages/cub/cub/cub/DeviceTopK.mdx
new file mode 100644
index 0000000..76713a5
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceTopK.mdx
@@ -0,0 +1,338 @@
+---
+title: cub::DeviceTopK
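+description: "DeviceTopK provides device-wide, parallel operations for finding the largest (or smallest) K items from sequences of unordered data items residing within device-accessible memory."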
+---
+
+DeviceTopK provides device-wide, parallel operations for finding the largest (or smallest) K items from sequences of unordered data items residing within device-accessible memory.
+
+## Performance considerations
+
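+The work-complexity of top-k as a function of input size is linear, resulting in performance throughput that plateaus with problem sizes large enough to saturate the GPU.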
+
+---
+
+## Static methods
+
+### MaxPairs inline static
+
+
+```cpp showLineNumbers={false}
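+template <typename KeyInputIteratorT, typename KeyOutputIteratorT, typename ValueInputIteratorT, typename ValueOutputIteratorT, typename NumItemsT, typename NumOutItemsT, typename EnvT = ::cuda::std::execution::env<>>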
+static cudaError_t cub::DeviceTopK::MaxPairs(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyInputIteratorT d_keys_in,
+ KeyOutputIteratorT d_keys_out,
+ ValueInputIteratorT d_values_in,
+ ValueOutputIteratorT d_values_out,
+ NumItemsT num_items,
+ NumOutItemsT k,
+ EnvT env = {}
+)
+```
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input keys (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing output keys (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading input values (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing output values (may be a simple pointer type)
+
+
+
+The integral type of variable num_items
+
+
+
+The integral type of variable k
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence containing the keys
+
+
+
+Random-access iterator to the output sequence of keys, where K values will be written to
+
+
+
+Random-access iterator to the input sequence containing the values associated to each key
+
+
+
+Random-access iterator to the output sequence of values, corresponding to the top k keys, where k values will be written to
+
+
+
+Number of items to be read and processed from `d_keys_in` and `d_values_in` each
+
+
+
+The value of K, which is the number of largest pairs to find from `num_items` pairs. Capped to a maximum of `num_items`.
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
+
+### MinPairs inline static
+
+
+```cpp showLineNumbers={false}
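+template <typename KeyInputIteratorT, typename KeyOutputIteratorT, typename ValueInputIteratorT, typename ValueOutputIteratorT, typename NumItemsT, typename NumOutItemsT, typename EnvT = ::cuda::std::execution::env<>>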
+static cudaError_t cub::DeviceTopK::MinPairs(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyInputIteratorT d_keys_in,
+ KeyOutputIteratorT d_keys_out,
+ ValueInputIteratorT d_values_in,
+ ValueOutputIteratorT d_values_out,
+ NumItemsT num_items,
+ NumOutItemsT k,
+ EnvT env = {}
+)
+```
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input keys (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing output keys (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access input iterator type for reading input values (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing output values (may be a simple pointer type)
+
+
+
+The integral type of variable num_items
+
+
+
+The integral type of variable k
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence containing the keys
+
+
+
+Random-access iterator to the output sequence of keys, where K values will be written to
+
+
+
+Random-access iterator to the input sequence containing the values associated to each key
+
+
+
+Random-access iterator to the output sequence of values, corresponding to the top k keys, where k values will be written to
+
+
+
+Number of items to be read and processed from `d_keys_in` and `d_values_in` each
+
+
+
+The value of K, which is the number of lowest pairs to find from `num_items` pairs. Capped to a maximum of `num_items`.
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
+
+### MaxKeys inline static
+
+
+```cpp showLineNumbers={false}
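+template <typename KeyInputIteratorT, typename KeyOutputIteratorT, typename NumItemsT, typename NumOutItemsT, typename EnvT = ::cuda::std::execution::env<>>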
+static cudaError_t cub::DeviceTopK::MaxKeys(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyInputIteratorT d_keys_in,
+ KeyOutputIteratorT d_keys_out,
+ NumItemsT num_items,
+ NumOutItemsT k,
+ EnvT env = {}
+)
+```
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input keys (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing output keys (may be a simple pointer type)
+
+
+
+The integral type of variable num_items
+
+
+
+The integral type of variable k
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence containing the keys
+
+
+
+Random-access iterator to the output sequence of keys, where K values will be written to
+
+
+
+Number of items to be read and processed from `d_keys_in`
+
+
+
+The value of K, which is the number of largest keys to find from `num_items` keys. Capped to a maximum of `num_items`.
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
+
+### MinKeys inline static
+
+
+```cpp showLineNumbers={false}
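+template <typename KeyInputIteratorT, typename KeyOutputIteratorT, typename NumItemsT, typename NumOutItemsT, typename EnvT = ::cuda::std::execution::env<>>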
+static cudaError_t cub::DeviceTopK::MinKeys(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyInputIteratorT d_keys_in,
+ KeyOutputIteratorT d_keys_out,
+ NumItemsT num_items,
+ NumOutItemsT k,
+ EnvT env = {}
+)
+```
+
+
+**Template parameters**
+
+
+**[inferred]** Random-access input iterator type for reading input keys (may be a simple pointer type)
+
+
+
+**[inferred]** Random-access output iterator type for writing output keys (may be a simple pointer type)
+
+
+
+The integral type of variable num_items
+
+
+
+The integral type of variable k
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+Random-access iterator to the input sequence containing the keys
+
+
+
+Random-access iterator to the output sequence of keys, where K values will be written to
+
+
+
+Number of items to be read and processed from `d_keys_in`
+
+
+
+The value of K, which is the number of lowest keys to find from `num_items` keys. Capped to a maximum of `num_items`.
+
+
+
+**[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+
+
+
diff --git a/fern/cudapages/cub/cub/cub/DeviceTransform.mdx b/fern/cudapages/cub/cub/cub/DeviceTransform.mdx
new file mode 100644
index 0000000..cdbf879
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DeviceTransform.mdx
@@ -0,0 +1,476 @@
+---
+title: cub::DeviceTransform
+description: "[DeviceTransform](/library/api/cub::_device_transform) provides device-wide, parallel operations for transforming elements tuple-wise from multiple input sequences into an output sequence."
+---
+
+`DeviceTransform` provides device-wide, parallel operations for transforming elements tuple-wise from multiple input sequences into an output sequence.
+
+---
+
+## Methods
+
+### TransformInternal inline static
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceTransform::TransformInternal(
+ ::cuda::std::tuple inputs,
+ RandomAccessIteratorOut output,
+ NumItemsT num_items,
+ Predicate predicate,
+ TransformOp transform_op,
+ Env env
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceTransform::TransformInternal(
+ ::cuda::std::tuple inputs,
+ ::cuda::std::tuple outputs,
+ NumItemsT num_items,
+ Predicate predicate,
+ TransformOp transform_op,
+ Env env
+)
+```
+
+
+
+
+
+---
+
+## Static methods
+
+### Transform inline static
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceTransform::Transform(
+ ::cuda::std::tuple inputs,
+ ::cuda::std::tuple outputs,
+ NumItemsT num_items,
+ TransformOp transform_op,
+ Env env = {}
+)
+```
+
+
+**Parameters**
+
+
+A tuple of iterators to the input sequences where num_items elements are read from each. The iterators' value types must be trivially relocatable.
+
+
+
+A tuple of iterators to the output sequences where num_items results are written to each. Each sequence may point to the beginning of one of the input sequences, performing the transformation inplace. Any output sequence must not overlap with any of the input sequence in any other way.
+
+
+
+The number of elements in each input and output sequence.
+
+
+
+An n-ary function object, where n is the number of input sequences. The input iterators' value types must be convertible to the parameters of the function object's call operator. The return type of the call operator must be a tuple where each tuple element is assignable to the corresponding dereferenced output iterators.
+
+
+
+Execution environment, or `cudaStream_t`. Default is `cuda::std::execution::env{}`, which will run on stream<sub>0</sub>.
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceTransform::Transform(
+ ::cuda::std::tuple inputs,
+ RandomAccessIteratorOut output,
+ NumItemsT num_items,
+ TransformOp transform_op,
+ Env env = {}
+)
+```
+
+
+*Added in v2.8.0. First appears in CUDA Toolkit 12.9.*
+
+**Parameters**
+
+
+A tuple of iterators to the input sequences where num_items elements are read from each.
+
+
+
+An iterator to the output sequence where num_items results are written to. May point to the beginning of one of the input sequences, performing the transformation inplace. The output sequence must not overlap with any of the input sequence in any other way.
+
+
+
+The number of elements in each input sequence.
+
+
+
+An n-ary function object, where n is the number of input sequences. The input iterators' value types must be convertible to the parameters of the function object's call operator. The return type of the call operator must be assignable to the dereferenced output iterator.
+
+
+
+Execution environment, or `cudaStream_t`. Default is `cuda::std::execution::env{}`, which will run on stream<sub>0</sub>.
+
+
+
+
+
+Transforms one input sequence into one output sequence, by applying a transformation operation on each input element and writing the result to the corresponding output element. No guarantee is given on the identity (i.e. address) of the objects passed to the call operator of the transformation operation.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceTransform::Transform(
+ RandomAccessIteratorIn input,
+ RandomAccessIteratorOut output,
+ NumItemsT num_items,
+ TransformOp transform_op,
+ Env env = {}
+)
+```
+
+
+*Added in v2.8.0. First appears in CUDA Toolkit 12.9.*
+
+**Parameters**
+
+
+An iterator to the input sequence where num_items elements are read from.
+
+
+
+An iterator to the output sequence where num_items results are written to. May point to the same sequence as `input`, performing the transformation inplace. The output sequence must not overlap with the input sequence in any other way.
+
+
+
+The number of elements in each input sequence.
+
+
+
+A unary function object. The input iterator's value type must be convertible to the parameter of the function object's call operator. The return type of the call operator must be assignable to the dereferenced output iterator.
+
+
+
+Execution environment, or `cudaStream_t`. Default is `cuda::std::execution::env{}`, which will run on stream<sub>0</sub>.
+
+
+
+
+
+### Generate inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceTransform::Generate(
+ RandomAccessIteratorOut output,
+ NumItemsT num_items,
+ Generator generator,
+ Env env = {}
+)
+```
+
+
+*Added in v2.8.0. First appears in CUDA Toolkit 12.9.*
+
+**Parameters**
+
+
+An iterator to the output sequence where num_items results are written to.
+
+
+
+The number of elements to write to the output sequence.
+
+
+
+A nullary function object. The return type of the call operator must be assignable to the dereferenced output iterator.
+
+
+
+Execution environment, or `cudaStream_t`. Default is `cuda::std::execution::env{}`, which will run on stream<sub>0</sub>.
+
+
+### Fill inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceTransform::Fill(
+ RandomAccessIteratorOut output,
+ NumItemsT num_items,
+ Value value,
+ Env env = {}
+)
+```
+
+
+*Added in v2.8.0. First appears in CUDA Toolkit 12.9.*
+
+**Parameters**
+
+
+An iterator to the output sequence where num_items results are written to.
+
+
+
+The number of elements to write to the output sequence.
+
+
+
+The value to write. Must be assignable to the dereferenced output iterator.
+
+
+
+Execution environment, or `cudaStream_t`. Default is `cuda::std::execution::env{}`, which will run on stream<sub>0</sub>.
+
+
+### TransformIf inline static
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceTransform::TransformIf(
+ ::cuda::std::tuple inputs,
+ RandomAccessIteratorOut output,
+ NumItemsT num_items,
+ Predicate predicate,
+ TransformOp transform_op,
+ Env env = {}
+)
+```
+
+
+*Added in v2.8.0. First appears in CUDA Toolkit 12.9.*
+
+**Parameters**
+
+
+A tuple of iterators to the input sequences where num_items elements are read from each.
+
+
+
+An iterator to the output sequence where num_items results are written to. May point to the beginning of one of the input sequences, performing the transformation inplace. The output sequence must not overlap with any of the input sequence in any other way.
+
+
+
+The number of elements in each input sequence.
+
+
+
+An n-ary function object, where n is the number of input sequences. The input iterators' value types must be convertible to the parameters of the function object's call operator, which must return a boolean value.
+
+
+
+An n-ary function object, where n is the number of input sequences. The input iterators' value types must be convertible to the parameters of the function object's call operator. The return type of the call operator must be assignable to the dereferenced output iterator. Will only be invoked if `predicate` returns true.
+
+
+
+Execution environment, or `cudaStream_t`. Default is `cuda::std::execution::env{}`, which will run on stream<sub>0</sub>.
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceTransform::TransformIf(
+ RandomAccessIteratorIn input,
+ RandomAccessIteratorOut output,
+ NumItemsT num_items,
+ Predicate predicate,
+ TransformOp transform_op,
+ Env env = {}
+)
+```
+
+
+*Added in v2.8.0. First appears in CUDA Toolkit 12.9.*
+
+**Parameters**
+
+
+An iterator to the input sequence where num_items elements are read from.
+
+
+
+An iterator to the output sequence where num_items results are written to. May point to the same sequence as `input`, performing the transformation inplace. The output sequence must not overlap with the input sequence in any other way.
+
+
+
+The number of elements in each input sequence.
+
+
+
+A unary function object returning `bool`. The input iterator's value type must be convertible to the parameter of the function object's call operator.
+
+
+
+A unary function object. The input iterator's value type must be convertible to the parameter of the function object's call operator. The return type of the call operator must be assignable to the dereferenced output iterator. Will only be invoked if `predicate` returns true.
+
+
+
+Execution environment, or `cudaStream_t`. Default is `cuda::std::execution::env{}`, which will run on stream<sub>0</sub>.
+
+
+
+
+
+### TransformStableArgumentAddresses inline static
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceTransform::TransformStableArgumentAddresses(
+ ::cuda::std::tuple inputs,
+ RandomAccessIteratorOut output,
+ NumItemsT num_items,
+ TransformOp transform_op,
+ Env env = {}
+)
+```
+
+
+*Added in v2.8.0. First appears in CUDA Toolkit 12.9.*
+
+**Parameters**
+
+
+A tuple of iterators to the input sequences where num_items elements are read from each.
+
+
+
+An iterator to the output sequence where num_items results are written to. May point to the beginning of one of the input sequences, performing the transformation inplace. The output sequence must not overlap with any of the input sequence in any other way.
+
+
+
+The number of elements in each input sequence.
+
+
+
+An n-ary function object, where n is the number of input sequences. The input iterators' value types must be convertible to the parameters of the function object's call operator. The return type of the call operator must be assignable to the dereferenced output iterator.
+
+
+
+Execution environment, or `cudaStream_t`. Default is `cuda::std::execution::env{}`, which will run on stream<sub>0</sub>.
+
+
+
+
+
+Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding input elements and writing the result to the corresponding output element. The objects passed to the call operator of the transformation operation are guaranteed to reside in the input sequences and are never copied.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DeviceTransform::TransformStableArgumentAddresses(
+ RandomAccessIteratorIn input,
+ RandomAccessIteratorOut output,
+ NumItemsT num_items,
+ TransformOp transform_op,
+ Env env = {}
+)
+```
+
+
+*Added in v2.8.0. First appears in CUDA Toolkit 12.9.*
+
+**Parameters**
+
+
+An iterator to the input sequence where num_items elements are read from.
+
+
+
+An iterator to the output sequence where num_items results are written to. May point to the same sequence as `input`, performing the transformation inplace. The output sequence must not overlap with the input sequence in any other way.
+
+
+
+The number of elements in each input sequence.
+
+
+
+A unary function object. The input iterator's value type must be convertible to the parameter of the function object's call operator. The return type of the call operator must be assignable to the dereferenced output iterator.
+
+
+
+Execution environment, or `cudaStream_t`. Default is `cuda::std::execution::env{}`, which will run on stream<sub>0</sub>.
+
+
+
+
diff --git a/fern/cudapages/cub/cub/cub/DispatchAdjacentDifference.mdx b/fern/cudapages/cub/cub/cub/DispatchAdjacentDifference.mdx
new file mode 100644
index 0000000..a757d8d
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DispatchAdjacentDifference.mdx
@@ -0,0 +1,110 @@
+---
+title: cub::DispatchAdjacentDifference
+description: ""
+---
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### DispatchAdjacentDifference inline
+
+
+```cpp showLineNumbers={false}
+cub::DispatchAdjacentDifference::DispatchAdjacentDifference(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_input,
+ OutputIteratorT d_output,
+ OffsetT num_items,
+ DifferenceOpT difference_op,
+ cudaStream_t stream
+)
+```
+
+
+---
+
+## Methods
+
+### Invoke inline
+
+Invocation.
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchAdjacentDifference::Invoke()
+```
+
+
+---
+
+## Static methods
+
+### Dispatch inline static
+
+
+```cpp showLineNumbers={false}
+static cudaError_t cub::DispatchAdjacentDifference::Dispatch(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_input,
+ OutputIteratorT d_output,
+ OffsetT num_items,
+ DifferenceOpT difference_op,
+ cudaStream_t stream
+)
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `InputT` | `detail::it_value_t< InputIteratorT >` |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `d_temp_storage` | `void *` | |
+| `temp_storage_bytes` | `size_t &` | |
+| `d_input` | `InputIteratorT` | |
+| `d_output` | `OutputIteratorT` | |
+| `num_items` | `OffsetT` | |
+| `difference_op` | `DifferenceOpT` | |
+| `stream` | `cudaStream_t` | |
diff --git a/fern/cudapages/cub/cub/cub/DispatchHistogram.mdx b/fern/cudapages/cub/cub/cub/DispatchHistogram.mdx
new file mode 100644
index 0000000..6cd7af0
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DispatchHistogram.mdx
@@ -0,0 +1,661 @@
+---
+title: cub::DispatchHistogram
+description: "Utility class for dispatching the appropriately-tuned kernels for [DeviceHistogram](/library/api/cub::_device_histogram)."
+---
+
+Utility class for dispatching the appropriately-tuned kernels for [DeviceHistogram](/library/api/cub::_device_histogram).
+
+
+
+
+
+Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+
+
+
+Number of channels actively being histogrammed
+
+
+
+Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+Integer type for counting sample occurrences per histogram bin
+
+
+
+Type for specifying bin level boundaries
+
+
+
+Signed integer type for global offsets
+
+
+
+Implementation detail, do not specify directly, requirements on the content of this type are subject to breaking change.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Static methods
+
+### DispatchRange inline static
+
+
+
+
+Dispatch routine for HistogramRange with host-side decode operator initialization, specialized for sample types larger than 8-bit.
+
+This variant initializes the decode operators on the host before kernel launch.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DispatchHistogram::DispatchRange(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ ::cuda::std::array d_output_histograms,
+ ::cuda::std::array num_output_levels,
+ ::cuda::std::array d_levels,
+ OffsetT num_row_pixels,
+ OffsetT num_rows,
+ OffsetT row_stride_samples,
+ cudaStream_t stream,
+ ::cuda::std::false_type,
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {},
+ MaxPolicyT max_policy = {}
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+
+
+
+The pointers to the histogram counter output arrays, one for each active channel. For channel<sub>i</sub>, the allocation length of `d_output_histograms[i]` should be `num_output_levels[i] - 1`.
+
+
+
+The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub>i</sub> is `num_output_levels[i] - 1`.
+
+
+
+The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+
+
+
+The number of multi-channel pixels per row in the region of interest
+
+
+
+The number of rows in the region of interest
+
+
+
+The number of samples between starts of consecutive rows in the region of interest
+
+
+
+CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+Dispatch routine for HistogramRange with host-side decode operator initialization, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels).
+
+This variant initializes the decode operators on the host before kernel launch.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DispatchHistogram::DispatchRange(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ ::cuda::std::array d_output_histograms,
+ ::cuda::std::array num_output_levels,
+ ::cuda::std::array d_levels,
+ OffsetT num_row_pixels,
+ OffsetT num_rows,
+ OffsetT row_stride_samples,
+ cudaStream_t stream,
+ ::cuda::std::true_type,
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {},
+ MaxPolicyT max_policy = {}
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+
+
+
+The pointers to the histogram counter output arrays, one for each active channel. For channel<sub>i</sub>, the allocation length of `d_output_histograms[i]` should be `num_output_levels[i] - 1`.
+
+
+
+The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub>i</sub> is `num_output_levels[i] - 1`.
+
+
+
+The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+
+
+
+The number of multi-channel pixels per row in the region of interest
+
+
+
+The number of rows in the region of interest
+
+
+
+The number of samples between starts of consecutive rows in the region of interest
+
+
+
+CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+### DispatchEven inline static
+
+
+
+
+Dispatch routine for HistogramEven with host-side decode operator initialization, specialized for sample types larger than 8-bit.
+
+This variant initializes the decode operators on the host before kernel launch.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DispatchHistogram::DispatchEven(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ ::cuda::std::array d_output_histograms,
+ ::cuda::std::array num_output_levels,
+ ::cuda::std::array lower_level,
+ ::cuda::std::array upper_level,
+ OffsetT num_row_pixels,
+ OffsetT num_rows,
+ OffsetT row_stride_samples,
+ cudaStream_t stream,
+ ::cuda::std::false_type,
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {},
+ MaxPolicyT max_policy = {}
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+
+
+
+The pointers to the histogram counter output arrays, one for each active channel. For channel<sub>i</sub>, the allocation length of `d_output_histograms[i]` should be `num_output_levels[i] - 1`.
+
+
+
+The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub>i</sub> is `num_output_levels[i] - 1`.
+
+
+
+The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+
+
+
+The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+
+
+
+The number of multi-channel pixels per row in the region of interest
+
+
+
+The number of rows in the region of interest
+
+
+
+The number of samples between starts of consecutive rows in the region of interest
+
+
+
+CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+Dispatch routine for HistogramEven with host-side decode operator initialization, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels).
+
+This variant initializes the decode operators on the host before kernel launch.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DispatchHistogram::DispatchEven(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ ::cuda::std::array d_output_histograms,
+ ::cuda::std::array num_output_levels,
+ ::cuda::std::array lower_level,
+ ::cuda::std::array upper_level,
+ OffsetT num_row_pixels,
+ OffsetT num_rows,
+ OffsetT row_stride_samples,
+ cudaStream_t stream,
+ ::cuda::std::true_type,
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {},
+ MaxPolicyT max_policy = {}
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+
+
+
+The pointers to the histogram counter output arrays, one for each active channel. For channel<sub>i</sub>, the allocation length of `d_output_histograms[i]` should be `num_output_levels[i] - 1`.
+
+
+
+The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub>i</sub> is `num_output_levels[i] - 1`.
+
+
+
+The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+
+
+
+The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+
+
+
+The number of multi-channel pixels per row in the region of interest
+
+
+
+The number of rows in the region of interest
+
+
+
+The number of samples between starts of consecutive rows in the region of interest
+
+
+
+CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+
+
+
+
+
+### __dispatch_range_device_init inline static
+
+
+
+
+Dispatch routine for HistogramRange with device-side decode operator initialization, specialized for sample types larger than 8-bit.
+
+This variant initializes the decode operators inside the kernel from level arrays.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DispatchHistogram::__dispatch_range_device_init(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ ::cuda::std::array d_output_histograms,
+ NumOutputLevelsArrayT num_output_levels,
+ LevelsArrayT d_levels,
+ OffsetT num_row_pixels,
+ OffsetT num_rows,
+ OffsetT row_stride_samples,
+ cudaStream_t stream,
+ ::cuda::std::false_type,
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {},
+ MaxPolicyT max_policy = {}
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+
+
+
+The pointers to the histogram counter output arrays, one for each active channel. For channel<sub>i</sub>, the allocation length of `d_output_histograms[i]` should be `num_output_levels[i] - 1`.
+
+
+
+The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub>i</sub> is `num_output_levels[i] - 1`.
+
+
+
+The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+
+
+
+The number of multi-channel pixels per row in the region of interest
+
+
+
+The number of rows in the region of interest
+
+
+
+The number of samples between starts of consecutive rows in the region of interest
+
+
+
+CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+
+Dispatch routine for HistogramRange with device-side decode operator initialization, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels).
+
+This variant initializes the decode operators inside the kernel from level arrays.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DispatchHistogram::__dispatch_range_device_init(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ ::cuda::std::array d_output_histograms,
+ NumOutputLevelsArrayT num_output_levels,
+ LevelsArrayT d_levels,
+ OffsetT num_row_pixels,
+ OffsetT num_rows,
+ OffsetT row_stride_samples,
+ cudaStream_t stream,
+ ::cuda::std::true_type,
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {},
+ MaxPolicyT max_policy = {}
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+
+
+
+The pointers to the histogram counter output arrays, one for each active channel. For channel *i*, the allocation length of `d_output_histograms[i]` should be `num_output_levels[i] - 1`.
+
+
+
+The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel *i* is `num_output_levels[i] - 1`.
+
+
+
+The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+
+
+
+The number of multi-channel pixels per row in the region of interest
+
+
+
+The number of rows in the region of interest
+
+
+
+The number of samples between starts of consecutive rows in the region of interest
+
+
+
+CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+
+### __dispatch_even_device_init inline static
+
+
+
+
+Dispatch routine for HistogramEven with device-side decode operator initialization, specialized for sample types larger than 8-bit.
+
+This variant initializes the decode operators inside the kernel from level bounds.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DispatchHistogram::__dispatch_even_device_init(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ ::cuda::std::array d_output_histograms,
+ ::cuda::std::array num_output_levels,
+ LowerLevelArrayT lower_level,
+ UpperLevelArrayT upper_level,
+ OffsetT num_row_pixels,
+ OffsetT num_rows,
+ OffsetT row_stride_samples,
+ cudaStream_t stream,
+ ::cuda::std::false_type,
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {},
+ MaxPolicyT max_policy = {}
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+
+
+
+The pointers to the histogram counter output arrays, one for each active channel. For channel *i*, the allocation length of `d_output_histograms[i]` should be `num_output_levels[i] - 1`.
+
+
+
+The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel *i* is `num_output_levels[i] - 1`.
+
+
+
+The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+
+
+
+The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+
+
+
+The number of multi-channel pixels per row in the region of interest
+
+
+
+The number of rows in the region of interest
+
+
+
+The number of samples between starts of consecutive rows in the region of interest
+
+
+
+CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
+
+Dispatch routine for HistogramEven with device-side decode operator initialization, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels).
+
+This variant initializes the decode operators inside the kernel from level bounds.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DispatchHistogram::__dispatch_even_device_init(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ SampleIteratorT d_samples,
+ ::cuda::std::array d_output_histograms,
+ ::cuda::std::array num_output_levels,
+ LowerLevelArrayT lower_level,
+ UpperLevelArrayT upper_level,
+ OffsetT num_row_pixels,
+ OffsetT num_rows,
+ OffsetT row_stride_samples,
+ cudaStream_t stream,
+ ::cuda::std::true_type,
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {},
+ MaxPolicyT max_policy = {}
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Reference to size in bytes of `d_temp_storage` allocation
+
+
+
+The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+
+
+
+The pointers to the histogram counter output arrays, one for each active channel. For channel *i*, the allocation length of `d_output_histograms[i]` should be `num_output_levels[i] - 1`.
+
+
+
+The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel *i* is `num_output_levels[i] - 1`.
+
+
+
+The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+
+
+
+The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+
+
+
+The number of multi-channel pixels per row in the region of interest
+
+
+
+The number of rows in the region of interest
+
+
+
+The number of samples between starts of consecutive rows in the region of interest
+
+
+
+CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+
diff --git a/fern/cudapages/cub/cub/cub/DispatchMergeSort.mdx b/fern/cudapages/cub/cub/cub/DispatchMergeSort.mdx
new file mode 100644
index 0000000..b199c4b
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DispatchMergeSort.mdx
@@ -0,0 +1,132 @@
+---
+title: cub::DispatchMergeSort
+description: ""
+---
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### DispatchMergeSort inline
+
+
+```cpp showLineNumbers={false}
+cub::DispatchMergeSort::DispatchMergeSort(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyInputIteratorT d_input_keys,
+ ValueInputIteratorT d_input_items,
+ KeyIteratorT d_output_keys,
+ ValueIteratorT d_output_items,
+ OffsetT num_items,
+ CompareOpT compare_op,
+ cudaStream_t stream,
+ int ptx_version,
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {}
+)
+```
+
+
+---
+
+## Methods
+
+### Invoke inline
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchMergeSort::Invoke(
+ ActivePolicyT policy = {}
+)
+```
+
+
+---
+
+## Static methods
+
+### Dispatch inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DispatchMergeSort::Dispatch(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyInputIteratorT d_input_keys,
+ ValueInputIteratorT d_input_items,
+ KeyIteratorT d_output_keys,
+ ValueIteratorT d_output_items,
+ OffsetT num_items,
+ CompareOpT compare_op,
+ cudaStream_t stream,
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {},
+ MaxPolicyT max_policy = {}
+)
+```
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `KEYS_ONLY` static constexpr | `bool` | Whether or not there are values to be carried along with keys. |
+| `d_temp_storage` | `void *` | Device-accessible allocation of temporary storage. |
+| `temp_storage_bytes` | `size_t &` | Reference to size in bytes of `d_temp_storage` allocation. |
+| `d_input_keys` | `KeyInputIteratorT` | Pointer to the input sequence of unsorted input keys. |
+| `d_input_items` | `ValueInputIteratorT` | Pointer to the input sequence of unsorted input values. |
+| `d_output_keys` | `KeyIteratorT` | Pointer to the output sequence of sorted input keys. |
+| `d_output_items` | `ValueIteratorT` | Pointer to the output sequence of sorted input values. |
+| `num_items` | `OffsetT` | Number of items to sort. |
+| `compare_op` | `CompareOpT` | Comparison function object which returns true if the first argument is ordered before the second. |
+| `stream` | `cudaStream_t` | CUDA stream to launch kernels within. Default is stream 0. |
+| `ptx_version` | `int` | |
+| `kernel_source` | `KernelSource` | |
+| `launcher_factory` | `KernelLauncherFactory` | |
diff --git a/fern/cudapages/cub/cub/cub/DispatchRadixSort.mdx b/fern/cudapages/cub/cub/cub/DispatchRadixSort.mdx
new file mode 100644
index 0000000..de4ac38
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DispatchRadixSort.mdx
@@ -0,0 +1,377 @@
+---
+title: cub::DispatchRadixSort
+description: "Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort."
+---
+
+Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort.
+
+
+
+
+
+
+
+
+Key type
+
+
+
+Value type
+
+
+
+Signed integer type for global offsets
+
+
+
+Implementation detail, do not specify directly, requirements on the content of this type are subject to breaking change.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### DispatchRadixSort inline
+
+
+```cpp showLineNumbers={false}
+cub::DispatchRadixSort::DispatchRadixSort(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,
+ OffsetT num_items,
+ int begin_bit,
+ int end_bit,
+ bool is_overwrite_okay,
+ cudaStream_t stream,
+ int ptx_version,
+ DecomposerT decomposer = {},
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {}
+)
+```
+
+
+---
+
+## Methods
+
+### InvokeSingleTile inline
+
+Invoke a single block to sort in-core.
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchRadixSort::InvokeSingleTile(
+ SingleTileKernelT single_tile_kernel,
+ ActivePolicyT policy = {}
+)
+```
+
+
+**Template parameters**
+
+
+Umbrella policy active for the target device
+
+
+
+Function type of cub::DeviceRadixSortSingleTileKernel
+
+
+**Parameters**
+
+
+Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel
+
+
+### InvokePass inline
+
+Invoke a three-kernel sorting pass at the current bit.
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchRadixSort::InvokePass(
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ OffsetT *d_spine,
+ int,
+ int &current_bit,
+ PassConfigT &pass_config
+)
+```
+
+
+### InvokeOnesweep inline
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchRadixSort::InvokeOnesweep(
+ ActivePolicyT policy = {}
+)
+```
+
+
+### InvokePasses inline
+
+Invocation (run multiple digit passes).
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchRadixSort::InvokePasses(
+ UpsweepKernelT upsweep_kernel,
+ UpsweepKernelT alt_upsweep_kernel,
+ ScanKernelT scan_kernel,
+ DownsweepKernelT downsweep_kernel,
+ DownsweepKernelT alt_downsweep_kernel,
+ ActivePolicyT policy = {}
+)
+```
+
+
+**Template parameters**
+
+
+Umbrella policy active for the target device
+
+
+
+Function type of cub::DeviceRadixSortUpsweepKernel
+
+
+
+Function type of cub::SpineScanKernel
+
+
+
+Function type of cub::DeviceRadixSortDownsweepKernel
+
+
+**Parameters**
+
+
+Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
+
+
+
+Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
+
+
+
+Kernel function pointer to parameterization of cub::SpineScanKernel
+
+
+
+Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
+
+
+
+Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
+
+
+### InvokeCopy inline
+
+
+```cpp showLineNumbers={false}
+cudaError_t cub::DispatchRadixSort::InvokeCopy()
+```
+
+
+### Invoke inline
+
+Invocation.
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchRadixSort::Invoke(
+ ActivePolicyT = {}
+)
+```
+
+
+### __invoke inline
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchRadixSort::__invoke(
+ PolicyGetter policy_getter
+)
+```
+
+
+### __invoke_single_tile inline
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchRadixSort::__invoke_single_tile(
+ SingleTileKernelT single_tile_kernel,
+ detail::radix_sort::radix_sort_downsweep_policy policy
+)
+```
+
+
+### __invoke_onesweep inline
+
+
+```cpp showLineNumbers={false}
+cudaError_t cub::DispatchRadixSort::__invoke_onesweep(
+ detail::radix_sort::radix_sort_policy policy
+)
+```
+
+
+### __invoke_passes inline
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchRadixSort::__invoke_passes(
+ UpsweepKernelT upsweep_kernel,
+ UpsweepKernelT alt_upsweep_kernel,
+ ScanKernelT scan_kernel,
+ DownsweepKernelT downsweep_kernel,
+ DownsweepKernelT alt_downsweep_kernel,
+ const detail::radix_sort::radix_sort_policy &policy
+)
+```
+
+
+---
+
+## Static methods
+
+### Dispatch inline static
+
+Internal dispatch routine.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DispatchRadixSort::Dispatch(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,
+ OffsetT num_items,
+ int begin_bit,
+ int end_bit,
+ bool is_overwrite_okay,
+ cudaStream_t stream,
+ DecomposerT decomposer = {},
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {},
+ MaxPolicyT max_policy = {}
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to [`temp_storage_bytes`](/library/api/cub::_dispatch_radix_sort::temp_storage_bytes) and no work is done.
+
+
+
+Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_radix_sort::d_temp_storage) allocation
+
+
+
+Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+
+
+
+Number of items to sort
+
+
+
+The beginning (least-significant) bit index needed for key comparison
+
+
+
+The past-the-end (most-significant) bit index needed for key comparison
+
+
+
+Whether it is okay to overwrite source buffers
+
+
+
+CUDA stream to launch kernels within. Default is stream 0.
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `KEYS_ONLY` static constexpr | `bool` | |
+| `d_temp_storage` | `void *` | Device-accessible allocation of temporary storage. |
+| `temp_storage_bytes` | `size_t &` | Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_radix_sort::d_temp_storage) allocation. |
+| `d_keys` | `DoubleBuffer< KeyT > &` | Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys. |
+| `d_values` | `DoubleBuffer< ValueT > &` | Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values. |
+| `num_items` | `OffsetT` | Number of items to sort. |
+| `begin_bit` | `int` | The beginning (least-significant) bit index needed for key comparison. |
+| `end_bit` | `int` | The past-the-end (most-significant) bit index needed for key comparison. |
+| `stream` | `cudaStream_t` | CUDA stream to launch kernels within. Default is stream 0. |
+| `ptx_version` | `int` | PTX version. |
+| `is_overwrite_okay` | `bool` | Whether it is okay to overwrite source buffers. |
+| `decomposer` | `DecomposerT` | |
+| `kernel_source` | `KernelSource` | |
+| `launcher_factory` | `KernelLauncherFactory` | |
+
+---
+
+## Inner classes
+
+### PassConfig
+
+
+```cpp showLineNumbers={false}
+struct cub::DispatchRadixSort::PassConfig
+```
+
+
+Pass configuration structure.
+
+| Name | Type | Description |
+|---|---|---|
+| `upsweep_kernel` | `UpsweepKernelT` | |
+| `upsweep_config` | `detail::KernelConfig` | |
+| `scan_kernel` | `ScanKernelT` | |
+| `scan_config` | `detail::KernelConfig` | |
+| `downsweep_kernel` | `DownsweepKernelT` | |
+| `downsweep_config` | `detail::KernelConfig` | |
+| `radix_bits` | `int` | |
+| `radix_digits` | `int` | |
+| `max_downsweep_grid_size` | `int` | |
+| `even_share` | `GridEvenShare< OffsetT >` | |
diff --git a/fern/cudapages/cub/cub/cub/DispatchReduce.mdx b/fern/cudapages/cub/cub/cub/DispatchReduce.mdx
new file mode 100644
index 0000000..37b9d0c
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DispatchReduce.mdx
@@ -0,0 +1,241 @@
+---
+title: cub::DispatchReduce
+description: "Utility class for dispatching the appropriately-tuned kernels for device-wide reduction."
+---
+
+Utility class for dispatching the appropriately-tuned kernels for device-wide reduction.
+
+
+
+
+
+Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+
+Signed integer type for global offsets
+
+
+
+Binary reduction functor type having member `auto operator()(const T &a, const U &b)`
+
+
+
+Initial value type
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### DispatchReduce inline
+
+Constructor.
+
+
+```cpp showLineNumbers={false}
+cub::DispatchReduce::DispatchReduce(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ OffsetT num_items,
+ ReductionOpT reduction_op,
+ InitT init,
+ cudaStream_t stream,
+ int ptx_version,
+ TransformOpT transform_op = {},
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {}
+)
+```
+
+
+---
+
+## Methods
+
+### InvokeSingleTile inline
+
+Invoke a single block to reduce in-core.
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchReduce::InvokeSingleTile(
+ SingleTileKernelT single_tile_kernel,
+ ActivePolicyT policy = {}
+)
+```
+
+
+**Template parameters**
+
+
+Umbrella policy active for the target device
+
+
+
+Function type of cub::DeviceReduceSingleTileKernel
+
+
+**Parameters**
+
+
+Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
+
+
+### InvokePasses inline
+
+Invoke two passes to reduce.
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchReduce::InvokePasses(
+ ReduceKernelT reduce_kernel,
+ SingleTileKernelT single_tile_kernel,
+ ActivePolicyT active_policy = {}
+)
+```
+
+
+**Template parameters**
+
+
+Umbrella policy active for the target device
+
+
+
+Function type of cub::DeviceReduceKernel
+
+
+
+Function type of cub::DeviceReduceSingleTileKernel
+
+
+**Parameters**
+
+
+Kernel function pointer to parameterization of cub::DeviceReduceKernel
+
+
+
+Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
+
+
+### Invoke inline
+
+Invocation.
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchReduce::Invoke(
+ ActivePolicyT active_policy = {}
+)
+```
+
+
+---
+
+## Static methods
+
+### Dispatch inline static
+
+Internal dispatch routine for computing a device-wide reduction.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DispatchReduce::Dispatch(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ OffsetT num_items,
+ ReductionOpT reduction_op,
+ InitT init,
+ cudaStream_t stream,
+ TransformOpT transform_op = {},
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {},
+ MaxPolicyT max_policy = {}
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to [`temp_storage_bytes`](/library/api/cub::_dispatch_reduce::temp_storage_bytes) and no work is done.
+
+
+
+Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_reduce::d_temp_storage) allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+Total number of input items (i.e., length of [`d_in`](/library/api/cub::_dispatch_reduce::d_in))
+
+
+
+Binary reduction functor
+
+
+
+The initial value of the reduction
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `d_temp_storage` | `void *` | Device-accessible allocation of temporary storage. |
+| `temp_storage_bytes` | `size_t &` | Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_reduce::d_temp_storage) allocation. |
+| `d_in` | `InputIteratorT` | Pointer to the input sequence of data items. |
+| `d_out` | `OutputIteratorT` | Pointer to the output aggregate. |
+| `num_items` | `OffsetT` | Total number of input items (i.e., length of [`d_in`](/library/api/cub::_dispatch_reduce::d_in)). |
+| `reduction_op` | `ReductionOpT` | Binary reduction functor. |
+| `init` | `InitT` | The initial value of the reduction. |
+| `stream` | `cudaStream_t` | CUDA stream to launch kernels within. Default is stream 0. |
+| `ptx_version` | `int` | |
+| `transform_op` | `TransformOpT` | |
+| `kernel_source` | `KernelSource` | |
+| `launcher_factory` | `KernelLauncherFactory` | |
diff --git a/fern/cudapages/cub/cub/cub/DispatchReduceByKey.mdx b/fern/cudapages/cub/cub/cub/DispatchReduceByKey.mdx
new file mode 100644
index 0000000..118fa7f
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DispatchReduceByKey.mdx
@@ -0,0 +1,210 @@
+---
+title: cub::DispatchReduceByKey
+description: "Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey."
+---
+
+Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey.
+
+
+
+
+
+Random-access input iterator type for keys
+
+
+
+Random-access output iterator type for keys
+
+
+
+Random-access input iterator type for values
+
+
+
+Random-access output iterator type for values
+
+
+
+Output iterator type for recording number of segments encountered
+
+
+
+KeyT equality operator type
+
+
+
+ValueT reduction operator type
+
+
+
+Signed integer type for global offsets
+
+
+
+
+
+
+Implementation detail, do not specify directly, requirements on the content of this type are subject to breaking change.
+
+
+
+
+
+---
+
+## Constructors
+
+### DispatchReduceByKey inline
+
+
+```cpp showLineNumbers={false}
+cub::DispatchReduceByKey::DispatchReduceByKey(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeysInputIteratorT d_keys_in,
+ UniqueOutputIteratorT d_unique_out,
+ ValuesInputIteratorT d_values_in,
+ AggregatesOutputIteratorT d_aggregates_out,
+ NumRunsOutputIteratorT d_num_runs_out,
+ EqualityOpT equality_op,
+ ReductionOpT reduction_op,
+ OffsetT num_items,
+ cudaStream_t stream
+)
+```
+
+
+---
+
+## Methods
+
+### Invoke inline
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchReduceByKey::Invoke(
+ ScanInitKernelT init_kernel,
+ ReduceByKeyKernelT reduce_by_key_kernel
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchReduceByKey::Invoke()
+```
+
+
+
+
+
+---
+
+## Static methods
+
+### Dispatch inline static
+
+Internal dispatch routine.
+
+
+```cpp showLineNumbers={false}
+static cudaError_t cub::DispatchReduceByKey::Dispatch(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeysInputIteratorT d_keys_in,
+ UniqueOutputIteratorT d_unique_out,
+ ValuesInputIteratorT d_values_in,
+ AggregatesOutputIteratorT d_aggregates_out,
+ NumRunsOutputIteratorT d_num_runs_out,
+ EqualityOpT equality_op,
+ ReductionOpT reduction_op,
+ OffsetT num_items,
+ cudaStream_t stream
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to [`temp_storage_bytes`](/library/api/cub::_dispatch_reduce_by_key::temp_storage_bytes) and no work is done.
+
+
+
+Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_reduce_by_key::d_temp_storage) allocation
+
+
+
+Pointer to the input sequence of keys
+
+
+
+Pointer to the output sequence of unique keys (one key per run)
+
+
+
+Pointer to the input sequence of corresponding values
+
+
+
+Pointer to the output sequence of value aggregates (one aggregate per run)
+
+
+
+Pointer to total number of runs encountered (i.e., the length of `d_unique_out`)
+
+
+
+KeyT equality operator
+
+
+
+ValueT reduction operator
+
+
+
+Total number of items to select from
+
+
+
+CUDA stream to launch kernels within. Default is stream 0.
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `ValueInputT` | `cub::detail::it_value_t< ValuesInputIteratorT >` |
+| `streaming_context_t` | `NullType` |
+| `ScanTileStateT` | `ReduceByKeyScanTileState< AccumT, OffsetT >` |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `INIT_KERNEL_THREADS` static constexpr | `int` | |
+| `d_temp_storage` | `void *` | |
+| `temp_storage_bytes` | `size_t &` | |
+| `d_keys_in` | `KeysInputIteratorT` | |
+| `d_unique_out` | `UniqueOutputIteratorT` | |
+| `d_values_in` | `ValuesInputIteratorT` | |
+| `d_aggregates_out` | `AggregatesOutputIteratorT` | |
+| `d_num_runs_out` | `NumRunsOutputIteratorT` | |
+| `equality_op` | `EqualityOpT` | |
+| `reduction_op` | `ReductionOpT` | |
+| `num_items` | `OffsetT` | |
+| `stream` | `cudaStream_t` | |
diff --git a/fern/cudapages/cub/cub/cub/DispatchScan.mdx b/fern/cudapages/cub/cub/cub/DispatchScan.mdx
new file mode 100644
index 0000000..1ecc522
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DispatchScan.mdx
@@ -0,0 +1,241 @@
+---
+title: cub::DispatchScan
+description: "Utility class for dispatching the appropriately-tuned kernels for DeviceScan."
+---
+
+Utility class for dispatching the appropriately-tuned kernels for [DeviceScan](/library/api/cub::_device_scan).
+
+
+
+
+
+Random-access input iterator type for reading scan inputs (may be a simple pointer type)
+
+
+
+Random-access output iterator type for writing scan outputs (may be a simple pointer type)
+
+
+
+Binary scan functor type having member `auto operator()(const T &a, const U &b)`
+
+
+
+The init_value element type for ScanOpT (cub::NullType for inclusive scans)
+
+
+
+Unsigned integer type for global offsets
+
+
+
+
+
+
+Enum flag to specify whether to enforce inclusive scan.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### DispatchScan inline
+
+
+```cpp showLineNumbers={false}
+cub::DispatchScan::DispatchScan(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ OffsetT num_items,
+ ScanOpT scan_op,
+ InitValueT init_value,
+ cudaStream_t stream,
+ int ptx_version,
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {}
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to [`temp_storage_bytes`](/library/api/cub::_dispatch_scan::temp_storage_bytes) and no work is done.
+
+
+
+Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_scan::d_temp_storage) allocation
+
+
+
+Iterator to the input sequence of data items
+
+
+
+Iterator to the output sequence of data items
+
+
+
+Total number of input items (i.e., the length of [`d_in`](/library/api/cub::_dispatch_scan::d_in))
+
+
+
+Binary scan functor
+
+
+
+Initial value to seed the exclusive scan
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+Object specifying implementation kernels
+
+
+
+Object to execute implementation kernels on the given stream
+
+
+---
+
+## Methods
+
+### Invoke inline
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchScan::Invoke(
+ InitKernelT init_kernel,
+ ScanKernelT scan_kernel,
+ ActivePolicyT policy = {}
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchScan::Invoke(
+ ActivePolicyT active_policy = {}
+)
+```
+
+
+
+
+
+---
+
+## Static methods
+
+### Dispatch inline static
+
+Internal dispatch routine.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DispatchScan::Dispatch(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ScanOpT scan_op,
+ InitValueT init_value,
+ OffsetT num_items,
+ cudaStream_t stream,
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {},
+ MaxPolicyT max_policy = {}
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to [`temp_storage_bytes`](/library/api/cub::_dispatch_scan::temp_storage_bytes) and no work is done.
+
+
+
+Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_scan::d_temp_storage) allocation
+
+
+
+Iterator to the input sequence of data items
+
+
+
+Iterator to the output sequence of data items
+
+
+
+Binary scan functor
+
+
+
+Initial value to seed the exclusive scan
+
+
+
+Total number of input items (i.e., the length of [`d_in`](/library/api/cub::_dispatch_scan::d_in))
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream 0.
+
+
+
+Object specifying implementation kernels
+
+
+
+Object to execute implementation kernels on the given stream
+
+
+
+Struct encoding chain of algorithm tuning policies
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `INIT_KERNEL_THREADS` static constexpr | `int` | |
+| `d_temp_storage` | `void *` | Device-accessible allocation of temporary storage. |
+| `temp_storage_bytes` | `size_t &` | Reference to size in bytes of `d_temp_storage` allocation. |
+| `d_in` | `InputIteratorT` | Iterator to the input sequence of data items. |
+| `d_out` | `OutputIteratorT` | Iterator to the output sequence of data items. |
+| `scan_op` | `ScanOpT` | Binary scan functor. |
+| `init_value` | `InitValueT` | Initial value to seed the exclusive scan. |
+| `num_items` | `OffsetT` | Total number of input items (i.e., the length of `d_in`). |
+| `stream` | `cudaStream_t` | CUDA stream to launch kernels within. Default is stream 0. |
+| `ptx_version` | `int` | |
+| `kernel_source` | `KernelSource` | |
+| `launcher_factory` | `KernelLauncherFactory` | |
diff --git a/fern/cudapages/cub/cub/cub/DispatchScanByKey.mdx b/fern/cudapages/cub/cub/cub/DispatchScanByKey.mdx
new file mode 100644
index 0000000..1a9d52b
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DispatchScanByKey.mdx
@@ -0,0 +1,242 @@
+---
+title: cub::DispatchScanByKey
+description: "Utility class for dispatching the appropriately-tuned kernels for [DeviceScan](/library/api/cub::_device_scan)."
+---
+
+Utility class for dispatching the appropriately-tuned kernels for [DeviceScan](/library/api/cub::_device_scan).
+
+
+
+
+
+Random-access input iterator type
+
+
+
+Random-access input iterator type
+
+
+
+Random-access output iterator type
+
+
+
+Equality functor type
+
+
+
+Scan functor type
+
+
+
+The init_value element for ScanOpT type (cub::NullType for inclusive scan)
+
+
+
+Unsigned integer type for global offsets
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### DispatchScanByKey inline
+
+
+```cpp showLineNumbers={false}
+cub::DispatchScanByKey::DispatchScanByKey(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeysInputIteratorT d_keys_in,
+ ValuesInputIteratorT d_values_in,
+ ValuesOutputIteratorT d_values_out,
+ EqualityOp equality_op,
+ ScanOpT scan_op,
+ InitValueT init_value,
+ OffsetT num_items,
+ cudaStream_t stream,
+ int ptx_version
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to [`temp_storage_bytes`](/library/api/cub::_dispatch_scan_by_key::temp_storage_bytes) and no work is done.
+
+
+
+Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_scan_by_key::d_temp_storage) allocation
+
+
+
+Iterator to the input sequence of key items
+
+
+
+Iterator to the input sequence of value items
+
+
+
+Iterator to the output sequence of value items
+
+
+
+Binary equality functor
+
+
+
+Binary scan functor
+
+
+
+Initial value to seed the exclusive scan
+
+
+
+Total number of input items (i.e., the length of `d_keys_in` or `d_values_in`)
+
+
+
+CUDA stream to launch kernels within.
+
+
+---
+
+## Methods
+
+### Invoke inline
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchScanByKey::Invoke(
+ InitKernel init_kernel,
+ ScanKernel scan_kernel
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchScanByKey::Invoke()
+```
+
+
+
+
+
+---
+
+## Static methods
+
+### Dispatch inline static
+
+Internal dispatch routine.
+
+
+```cpp showLineNumbers={false}
+static cudaError_t cub::DispatchScanByKey::Dispatch(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeysInputIteratorT d_keys_in,
+ ValuesInputIteratorT d_values_in,
+ ValuesOutputIteratorT d_values_out,
+ EqualityOp equality_op,
+ ScanOpT scan_op,
+ InitValueT init_value,
+ OffsetT num_items,
+ cudaStream_t stream
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to [`temp_storage_bytes`](/library/api/cub::_dispatch_scan_by_key::temp_storage_bytes) and no work is done.
+
+
+
+Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_scan_by_key::d_temp_storage) allocation
+
+
+
+Iterator to the input sequence of key items
+
+
+
+Iterator to the input sequence of value items
+
+
+
+Iterator to the output sequence of value items
+
+
+
+Binary equality functor
+
+
+
+Binary scan functor
+
+
+
+Initial value to seed the exclusive scan
+
+
+
+Total number of input items (i.e., the length of `d_keys_in` or `d_values_in`)
+
+
+
+CUDA stream to launch kernels within.
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `KeyT` | `cub::detail::it_value_t< KeysInputIteratorT >` |
+| `InputT` | `cub::detail::it_value_t< ValuesInputIteratorT >` |
+| `ScanByKeyTileStateT` | `ReduceByKeyScanTileState< AccumT, int >` |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `INIT_KERNEL_THREADS` static constexpr | `int` | |
+| `d_temp_storage` | `void *` | Device-accessible allocation of temporary storage. |
+| `temp_storage_bytes` | `size_t &` | Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_scan_by_key::d_temp_storage) allocation. |
+| `d_keys_in` | `KeysInputIteratorT` | Iterator to the input sequence of key items. |
+| `d_values_in` | `ValuesInputIteratorT` | Iterator to the input sequence of value items. |
+| `d_values_out` | `ValuesOutputIteratorT` | Iterator to the output sequence of value items. |
+| `equality_op` | `EqualityOp` | Binary equality functor. |
+| `scan_op` | `ScanOpT` | Binary scan functor. |
+| `init_value` | `InitValueT` | Initial value to seed the exclusive scan. |
+| `num_items` | `OffsetT` | Total number of input items (i.e., the length of `d_keys_in` or `d_values_in`). |
+| `stream` | `cudaStream_t` | CUDA stream to launch kernels within. |
+| `ptx_version` | `int` | |
diff --git a/fern/cudapages/cub/cub/cub/DispatchSegmentedRadixSort.mdx b/fern/cudapages/cub/cub/cub/DispatchSegmentedRadixSort.mdx
new file mode 100644
index 0000000..97b0686
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DispatchSegmentedRadixSort.mdx
@@ -0,0 +1,274 @@
+---
+title: cub::DispatchSegmentedRadixSort
+description: "Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort."
+---
+
+Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort.
+
+
+
+
+
+
+
+
+Key type
+
+
+
+Value type
+
+
+
+Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+
+Integer type to index items within a segment
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### DispatchSegmentedRadixSort inline
+
+Constructor.
+
+
+```cpp showLineNumbers={false}
+cub::DispatchSegmentedRadixSort::DispatchSegmentedRadixSort(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ int begin_bit,
+ int end_bit,
+ bool is_overwrite_okay,
+ cudaStream_t stream,
+ int ptx_version,
+ DecomposerT decomposer = {},
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {}
+)
+```
+
+
+---
+
+## Methods
+
+### InvokePass inline
+
+Invoke a three-kernel sorting pass at the current bit.
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchSegmentedRadixSort::InvokePass(
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ int &current_bit,
+ PassConfigT &pass_config
+)
+```
+
+
+### InvokePasses inline
+
+Invocation (run multiple digit passes).
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchSegmentedRadixSort::InvokePasses(
+ SegmentedKernelT segmented_kernel,
+ SegmentedKernelT alt_segmented_kernel,
+ ActivePolicyT policy = {}
+)
+```
+
+
+**Template parameters**
+
+
+Umbrella policy active for the target device
+
+
+
+Function type of cub::DeviceSegmentedRadixSortKernel
+
+
+**Parameters**
+
+
+Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
+
+
+
+Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
+
+
+### Invoke inline
+
+Invocation.
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchSegmentedRadixSort::Invoke(
+ ActivePolicyT policy = {}
+)
+```
+
+
+---
+
+## Static methods
+
+### Dispatch inline static
+
+Internal dispatch routine.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DispatchSegmentedRadixSort::Dispatch(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,
+ ::cuda::std::int64_t num_items,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ int begin_bit,
+ int end_bit,
+ bool is_overwrite_okay,
+ cudaStream_t stream,
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {},
+ MaxPolicyT max_policy = {}
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to [`temp_storage_bytes`](/library/api/cub::_dispatch_segmented_radix_sort::temp_storage_bytes) and no work is done.
+
+
+
+Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_segmented_radix_sort::d_temp_storage) allocation
+
+
+
+Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+
+
+
+Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+
+
+
+Number of items to sort
+
+
+
+The number of segments that comprise the sorting data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of length [`num_segments`](/library/api/cub::_dispatch_segmented_radix_sort::num_segments), such that [`d_begin_offsets`](/library/api/cub::_dispatch_segmented_radix_sort::d_begin_offsets)`[i]` is the first element of the *i*th data segment in `d_keys_*` and `d_values_*`
+
+
+
+Random-access input iterator to the sequence of ending offsets of length [`num_segments`](/library/api/cub::_dispatch_segmented_radix_sort::num_segments), such that [`d_end_offsets`](/library/api/cub::_dispatch_segmented_radix_sort::d_end_offsets)`[i]-1` is the last element of the *i*th data segment in `d_keys_*` and `d_values_*`. If [`d_end_offsets`](/library/api/cub::_dispatch_segmented_radix_sort::d_end_offsets)`[i]-1 <= `[`d_begin_offsets`](/library/api/cub::_dispatch_segmented_radix_sort::d_begin_offsets)`[i]`, the *i*th segment is considered empty.
+
+
+
+The beginning (least-significant) bit index needed for key comparison
+
+
+
+The past-the-end (most-significant) bit index needed for key comparison
+
+
+
+Whether it is okay to overwrite source buffers
+
+
+
+CUDA stream to launch kernels within. Default is stream0.
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `KEYS_ONLY` static constexpr | `bool` | |
+| `d_temp_storage` | `void *` | Device-accessible allocation of temporary storage. |
+| `temp_storage_bytes` | `size_t &` | Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_segmented_radix_sort::d_temp_storage) allocation. |
+| `d_keys` | `DoubleBuffer< KeyT > &` | Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys. |
+| `d_values` | `DoubleBuffer< ValueT > &` | Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values. |
+| `num_items` | `::cuda::std::int64_t` | Number of items to sort. |
+| `num_segments` | `::cuda::std::int64_t` | The number of segments that comprise the sorting data. |
+| `d_begin_offsets` | `BeginOffsetIteratorT` | Random-access input iterator to the sequence of beginning offsets of length [`num_segments`](/library/api/cub::_dispatch_segmented_radix_sort::num_segments), such that [`d_begin_offsets`](/library/api/cub::_dispatch_segmented_radix_sort::d_begin_offsets)`[i]` is the first element of the *i*th data segment in `d_keys_*` and `d_values_*`. |
+| `d_end_offsets` | `EndOffsetIteratorT` | Random-access input iterator to the sequence of ending offsets of length [`num_segments`](/library/api/cub::_dispatch_segmented_radix_sort::num_segments), such that [`d_end_offsets`](/library/api/cub::_dispatch_segmented_radix_sort::d_end_offsets)`[i]-1` is the last element of the *i*th data segment in `d_keys_*` and `d_values_*`. |
+| `begin_bit` | `int` | The beginning (least-significant) bit index needed for key comparison. |
+| `end_bit` | `int` | The past-the-end (most-significant) bit index needed for key comparison. |
+| `stream` | `cudaStream_t` | CUDA stream to launch kernels within. Default is stream0. |
+| `ptx_version` | `int` | PTX version. |
+| `is_overwrite_okay` | `bool` | Whether it is okay to overwrite source buffers. |
+| `decomposer` | `DecomposerT` | |
+| `kernel_source` | `KernelSource` | |
+| `launcher_factory` | `KernelLauncherFactory` | |
+
+---
+
+## Inner classes
+
+### PassConfig
+
+
+```cpp showLineNumbers={false}
+struct cub::DispatchSegmentedRadixSort::PassConfig
+```
+
+
+[PassConfig](/library/api/cub::DispatchSegmentedRadixSort::PassConfig) data structure.
+
+| Name | Type | Description |
+|---|---|---|
+| `segmented_kernel` | `SegmentedKernelT` | |
+| `segmented_config` | `detail::KernelConfig` | |
+| `radix_bits` | `int` | |
+| `radix_digits` | `int` | |
diff --git a/fern/cudapages/cub/cub/cub/DispatchSegmentedReduce.mdx b/fern/cudapages/cub/cub/cub/DispatchSegmentedReduce.mdx
new file mode 100644
index 0000000..e20d207
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DispatchSegmentedReduce.mdx
@@ -0,0 +1,218 @@
+---
+title: cub::DispatchSegmentedReduce
+description: "Utility class for dispatching the appropriately-tuned kernels for device-wide reduction."
+---
+
+Utility class for dispatching the appropriately-tuned kernels for device-wide reduction.
+
+
+
+
+
+Random-access input iterator type for reading input items (may be a simple pointer type)
+
+
+
+Output iterator type for recording the reduced aggregate (may be a simple pointer type)
+
+
+
+Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
+
+
+
+Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
+
+
+
+Signed integer type for global offsets
+
+
+
+Binary reduction functor type having member `auto operator()(const T &a, const U &b)`
+
+
+
+Value type
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### DispatchSegmentedReduce inline
+
+Constructor.
+
+
+```cpp showLineNumbers={false}
+cub::DispatchSegmentedReduce::DispatchSegmentedReduce(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ ReductionOpT reduction_op,
+ InitT init,
+ cudaStream_t stream,
+ int ptx_version,
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {}
+)
+```
+
+
+---
+
+## Methods
+
+### InvokePasses inline
+
+Invocation.
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchSegmentedReduce::InvokePasses(
+ DeviceSegmentedReduceKernelT segmented_reduce_kernel,
+ ActivePolicyT policy = {}
+)
+```
+
+
+**Template parameters**
+
+
+Umbrella policy active for the target device
+
+
+
+Function type of cub::DeviceSegmentedReduceKernel
+
+
+**Parameters**
+
+
+Kernel function pointer to instantiation of cub::DeviceSegmentedReduceKernel
+
+
+### Invoke inline
+
+Invocation.
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchSegmentedReduce::Invoke(
+ ActivePolicyT policy = {}
+)
+```
+
+
+---
+
+## Static methods
+
+### Dispatch inline static
+
+Internal dispatch routine for computing a device-wide reduction.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DispatchSegmentedReduce::Dispatch(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ ::cuda::std::int64_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ ReductionOpT reduction_op,
+ InitT init,
+ cudaStream_t stream,
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {},
+ MaxPolicyT max_policy = {}
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to [`temp_storage_bytes`](/library/api/cub::_dispatch_segmented_reduce::temp_storage_bytes) and no work is done.
+
+
+
+Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_segmented_reduce::d_temp_storage) allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the output aggregate
+
+
+
+The number of segments that comprise the segmented reduction data
+
+
+
+Random-access input iterator to the sequence of beginning offsets of length [`num_segments`](/library/api/cub::_dispatch_segmented_reduce::num_segments), such that [`d_begin_offsets`](/library/api/cub::_dispatch_segmented_reduce::d_begin_offsets)`[i]` is the first element of the *i*th data segment in `d_in`
+
+
+
+Random-access input iterator to the sequence of ending offsets of length [`num_segments`](/library/api/cub::_dispatch_segmented_reduce::num_segments), such that [`d_end_offsets`](/library/api/cub::_dispatch_segmented_reduce::d_end_offsets)`[i] - 1` is the last element of the *i*th data segment in `d_in`. If [`d_end_offsets`](/library/api/cub::_dispatch_segmented_reduce::d_end_offsets)`[i] - 1 <= `[`d_begin_offsets`](/library/api/cub::_dispatch_segmented_reduce::d_begin_offsets)`[i]`, the *i*th segment is considered empty.
+
+
+
+Binary reduction functor
+
+
+
+The initial value of the reduction
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream0.
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `d_temp_storage` | `void *` | Device-accessible allocation of temporary storage. |
+| `temp_storage_bytes` | `size_t &` | Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_segmented_reduce::d_temp_storage) allocation. |
+| `d_in` | `InputIteratorT` | Pointer to the input sequence of data items. |
+| `d_out` | `OutputIteratorT` | Pointer to the output aggregate. |
+| `num_segments` | `::cuda::std::int64_t` | The number of segments that comprise the segmented reduction data. |
+| `d_begin_offsets` | `BeginOffsetIteratorT` | Random-access input iterator to the sequence of beginning offsets of length [`num_segments`](/library/api/cub::_dispatch_segmented_reduce::num_segments), such that [`d_begin_offsets`](/library/api/cub::_dispatch_segmented_reduce::d_begin_offsets)`[i]` is the first element of the *i*th data segment in `d_keys_*` and `d_values_*`. |
+| `d_end_offsets` | `EndOffsetIteratorT` | Random-access input iterator to the sequence of ending offsets of length [`num_segments`](/library/api/cub::_dispatch_segmented_reduce::num_segments), such that [`d_end_offsets`](/library/api/cub::_dispatch_segmented_reduce::d_end_offsets)`[i] - 1` is the last element of the *i*th data segment in `d_keys_*` and `d_values_*`. |
+| `reduction_op` | `ReductionOpT` | Binary reduction functor. |
+| `init` | `InitT` | The initial value of the reduction. |
+| `stream` | `cudaStream_t` | CUDA stream to launch kernels within. Default is stream0. |
+| `ptx_version` | `int` | |
+| `kernel_source` | `KernelSource` | |
+| `launcher_factory` | `KernelLauncherFactory` | |
diff --git a/fern/cudapages/cub/cub/cub/DispatchSegmentedSort.mdx b/fern/cudapages/cub/cub/cub/DispatchSegmentedSort.mdx
new file mode 100644
index 0000000..b38c67c
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DispatchSegmentedSort.mdx
@@ -0,0 +1,190 @@
+---
+title: cub::DispatchSegmentedSort
+description: ""
+---
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Methods
+
+### Invoke inline
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchSegmentedSort::Invoke(
+ ActivePolicyT policy = {}
+)
+```
+
+
+### GetNumPasses inline
+
+
+```cpp showLineNumbers={false}
+int cub::DispatchSegmentedSort::GetNumPasses(
+ int radix_bits
+)
+```
+
+
+### GetFinalSelector inline
+
+
+```cpp showLineNumbers={false}
+int cub::DispatchSegmentedSort::GetFinalSelector(
+ int selector,
+ int radix_bits
+)
+```
+
+
+### GetFinalOutput inline
+
+
+```cpp showLineNumbers={false}
+template
+T * cub::DispatchSegmentedSort::GetFinalOutput(
+ int radix_bits,
+ DoubleBuffer &buffer
+)
+```
+
+
+### SortWithPartitioning inline
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchSegmentedSort::SortWithPartitioning(
+ LargeKernelT large_kernel,
+ SmallKernelT small_kernel,
+ size_t three_way_partition_temp_storage_bytes,
+ cub::detail::device_double_buffer &d_keys_double_buffer,
+ cub::detail::device_double_buffer &d_values_double_buffer,
+ typename KernelSource::LargeSegmentsSelectorT &large_segments_selector,
+ typename KernelSource::SmallSegmentsSelectorT &small_segments_selector,
+ cub::detail::temporary_storage::alias &device_partition_temp_storage,
+ cub::detail::temporary_storage::alias &large_and_medium_segments_indices,
+ cub::detail::temporary_storage::alias &small_segments_indices,
+ cub::detail::temporary_storage::alias &group_sizes,
+ WrappedPolicyT wrapped_policy
+)
+```
+
+
+### SortWithoutPartitioning inline
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchSegmentedSort::SortWithoutPartitioning(
+ FallbackKernelT fallback_kernel,
+ cub::detail::device_double_buffer &d_keys_double_buffer,
+ cub::detail::device_double_buffer &d_values_double_buffer,
+ WrappedPolicyT wrapped_policy
+)
+```
+
+
+---
+
+## Static methods
+
+### Dispatch inline static
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DispatchSegmentedSort::Dispatch(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ DoubleBuffer<KeyT> &d_keys,
+ DoubleBuffer<ValueT> &d_values,
+ ::cuda::std::int64_t num_items,
+ global_segment_offset_t num_segments,
+ BeginOffsetIteratorT d_begin_offsets,
+ EndOffsetIteratorT d_end_offsets,
+ bool is_overwrite_okay,
+ cudaStream_t stream,
+ KernelSource kernel_source = {},
+ PartitionKernelSource partition_kernel_source = {},
+ KernelLauncherFactory launcher_factory = {},
+ MaxPolicyT max_policy = {},
+ PartitionMaxPolicyT partition_max_policy = {}
+)
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `local_segment_index_t` | `detail::segmented_sort::local_segment_index_t` |
+| `global_segment_offset_t` | `detail::segmented_sort::global_segment_offset_t` |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `KEYS_ONLY` static constexpr | `int` | |
+| `num_selected_groups` static constexpr | `size_t` | |
+| `d_temp_storage` | `void *` | Device-accessible allocation of temporary storage. |
+| `temp_storage_bytes` | `size_t &` | Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_segmented_sort::d_temp_storage) allocation. |
+| `d_keys` | `DoubleBuffer< KeyT > &` | Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys. |
+| `d_values` | `DoubleBuffer< ValueT > &` | Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values. |
+| `num_items` | `::cuda::std::int64_t` | Number of items to sort. |
+| `num_segments` | `global_segment_offset_t` | The number of segments that comprise the sorting data. |
+| `d_begin_offsets` | `BeginOffsetIteratorT` | Random-access input iterator to the sequence of beginning offsets of length [`num_segments`](/library/api/cub::_dispatch_segmented_sort::num_segments), such that [`d_begin_offsets`](/library/api/cub::_dispatch_segmented_sort::d_begin_offsets)`[i]` is the first element of the *i*th data segment in `d_keys_*` and `d_values_*`. |
+| `d_end_offsets` | `EndOffsetIteratorT` | Random-access input iterator to the sequence of ending offsets of length [`num_segments`](/library/api/cub::_dispatch_segmented_sort::num_segments), such that [`d_end_offsets`](/library/api/cub::_dispatch_segmented_sort::d_end_offsets)`[i]-1` is the last element of the *i*th data segment in `d_keys_*` and `d_values_*`. |
+| `is_overwrite_okay` | `bool` | Whether it is okay to overwrite source buffers. |
+| `stream` | `cudaStream_t` | CUDA stream to launch kernels within. |
+| `kernel_source` | `KernelSource` | |
+| `partition_kernel_source` | `PartitionKernelSource` | |
+| `launcher_factory` | `KernelLauncherFactory` | |
+| `partition_max_policy` | `PartitionPolicyHub::MaxPolicy` | |
diff --git a/fern/cudapages/cub/cub/cub/DispatchSelectIf.mdx b/fern/cudapages/cub/cub/cub/DispatchSelectIf.mdx
new file mode 100644
index 0000000..d27187d
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DispatchSelectIf.mdx
@@ -0,0 +1,248 @@
+---
+title: cub::DispatchSelectIf
+description: "Utility class for dispatching the appropriately-tuned kernels for [DeviceSelect](/library/api/cub::_device_select) and [DevicePartition](/library/api/cub::_device_partition)."
+---
+
+Utility class for dispatching the appropriately-tuned kernels for [DeviceSelect](/library/api/cub::_device_select) and [DevicePartition](/library/api/cub::_device_partition).
+
+
+
+
+
+Random-access input iterator type for reading input items
+
+
+
+Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is used for selection)
+
+
+
+Random-access output iterator type for writing selected items
+
+
+
+Output iterator type for recording the number of items selected
+
+
+
+Selection operator type (NullType if selection flags or discontinuity flagging is used for selection)
+
+
+
+Equality operator type (NullType if selection functor or selection flags are used for selection)
+
+
+
+Signed integer type for global offsets
+
+
+
+[SelectImpl](/library/api/cub::SelectImpl) indicating whether to partition, to select only, or to select where the memory for the input and output may alias each other.
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### DispatchSelectIf inline
+
+
+```cpp showLineNumbers={false}
+cub::DispatchSelectIf::DispatchSelectIf(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ FlagsInputIteratorT d_flags,
+ SelectedOutputIteratorT d_selected_out,
+ NumSelectedIteratorT d_num_selected_out,
+ SelectOpT select_op,
+ EqualityOpT equality_op,
+ OffsetT num_items,
+ cudaStream_t stream,
+ int ptx_version
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to [`temp_storage_bytes`](/library/api/cub::_dispatch_select_if::temp_storage_bytes) and no work is done.
+
+
+
+Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_select_if::d_temp_storage) allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the input sequence of selection flags (if applicable)
+
+
+
+Pointer to the output sequence of selected data items
+
+
+
+Pointer to the total number of items selected (i.e., length of [`d_selected_out`](/library/api/cub::_dispatch_select_if::d_selected_out))
+
+
+
+Selection operator
+
+
+
+Equality operator
+
+
+
+Total number of input items (i.e., length of [`d_in`](/library/api/cub::_dispatch_select_if::d_in))
+
+
+
+CUDA stream to launch kernels within. Default is stream0.
+
+
+---
+
+## Methods
+
+### Invoke inline
+
+
+
+
+Internal dispatch routine for computing a device-wide selection using the specified kernel functions.
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchSelectIf::Invoke(
+ ScanInitKernelPtrT scan_init_kernel,
+ SelectIfKernelPtrT select_if_kernel
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchSelectIf::Invoke()
+```
+
+
+
+
+
+---
+
+## Static methods
+
+### Dispatch inline static
+
+Internal dispatch routine.
+
+
+```cpp showLineNumbers={false}
+static cudaError_t cub::DispatchSelectIf::Dispatch(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ FlagsInputIteratorT d_flags,
+ SelectedOutputIteratorT d_selected_out,
+ NumSelectedIteratorT d_num_selected_out,
+ SelectOpT select_op,
+ EqualityOpT equality_op,
+ OffsetT num_items,
+ cudaStream_t stream
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When `nullptr`, the required allocation size is written to [`temp_storage_bytes`](/library/api/cub::_dispatch_select_if::temp_storage_bytes) and no work is done.
+
+
+
+Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_select_if::d_temp_storage) allocation
+
+
+
+Pointer to the input sequence of data items
+
+
+
+Pointer to the input sequence of selection flags (if applicable)
+
+
+
+Pointer to the output sequence of selected data items
+
+
+
+Pointer to the total number of items selected (i.e., length of [`d_selected_out`](/library/api/cub::_dispatch_select_if::d_selected_out))
+
+
+
+Selection operator
+
+
+
+Equality operator
+
+
+
+Total number of input items (i.e., length of [`d_in`](/library/api/cub::_dispatch_select_if::d_in))
+
+
+
+CUDA stream to launch kernels within. Default is stream0.
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `per_partition_offset_t` | `detail::select::per_partition_offset_t` |
+| `num_total_items_t` | `OffsetT` |
+| `streaming_context_t` | `detail::select::streaming_context_t< num_total_items_t, use_streaming_context >` |
+| `ScanTileStateT` | `ScanTileState< per_partition_offset_t >` |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `is_partitioning_invocation` static constexpr | `bool` | |
+| `use_streaming_context` static constexpr | `bool` | |
+| `INIT_KERNEL_THREADS` static constexpr | `int` | |
+| `d_temp_storage` | `void *` | Device-accessible allocation of temporary storage. |
+| `temp_storage_bytes` | `size_t &` | Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_select_if::d_temp_storage) allocation. |
+| `d_in` | `InputIteratorT` | Pointer to the input sequence of data items. |
+| `d_flags` | `FlagsInputIteratorT` | Pointer to the input sequence of selection flags (if applicable). |
+| `d_selected_out` | `SelectedOutputIteratorT` | Pointer to the output sequence of selected data items. |
+| `d_num_selected_out` | `NumSelectedIteratorT` | Pointer to the total number of items selected (i.e., length of [`d_selected_out`](/library/api/cub::_dispatch_select_if::d_selected_out)). |
+| `select_op` | `SelectOpT` | Selection operator. |
+| `equality_op` | `EqualityOpT` | Equality operator. |
+| `num_items` | `OffsetT` | Total number of input items (i.e., length of [`d_in`](/library/api/cub::_dispatch_select_if::d_in)). |
+| `stream` | `cudaStream_t` | CUDA stream to launch kernels within. Default is stream0. |
+| `ptx_version` | `int` | |
diff --git a/fern/cudapages/cub/cub/cub/DispatchThreeWayPartitionIf.mdx b/fern/cudapages/cub/cub/cub/DispatchThreeWayPartitionIf.mdx
new file mode 100644
index 0000000..ca12703
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DispatchThreeWayPartitionIf.mdx
@@ -0,0 +1,142 @@
+---
+title: cub::DispatchThreeWayPartitionIf
+description: ""
+---
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Methods
+
+### Invoke inline
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchThreeWayPartitionIf::Invoke(
+ ActivePolicyT policy,
+ ScanInitKernelPtrT three_way_partition_init_kernel,
+ SelectIfKernelPtrT three_way_partition_kernel
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchThreeWayPartitionIf::Invoke(
+ ActivePolicyT active_policy = {}
+)
+```
+
+
+
+
+
+---
+
+## Static methods
+
+### Dispatch inline static
+
+Internal dispatch routine.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DispatchThreeWayPartitionIf::Dispatch(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ FirstOutputIteratorT d_first_part_out,
+ SecondOutputIteratorT d_second_part_out,
+ UnselectedOutputIteratorT d_unselected_out,
+ NumSelectedIteratorT d_num_selected_out,
+ SelectFirstPartOp select_first_part_op,
+ SelectSecondPartOp select_second_part_op,
+ OffsetT num_items,
+ cudaStream_t stream,
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {},
+ MaxPolicyT max_policy = {}
+)
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `per_partition_offset_t` | `detail::three_way_partition::per_partition_offset_t` |
+| `streaming_context_t` | `detail::three_way_partition::streaming_context_t< OffsetT >` |
+| `ScanTileStateT` | `detail::three_way_partition::ScanTileStateT` |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `partition_size` static constexpr | `per_partition_offset_t` | |
+| `INIT_KERNEL_THREADS` static constexpr | `int` | |
+| `d_temp_storage` | `void *` | |
+| `temp_storage_bytes` | `size_t &` | |
+| `d_in` | `InputIteratorT` | |
+| `d_first_part_out` | `FirstOutputIteratorT` | |
+| `d_second_part_out` | `SecondOutputIteratorT` | |
+| `d_unselected_out` | `UnselectedOutputIteratorT` | |
+| `d_num_selected_out` | `NumSelectedIteratorT` | |
+| `select_first_part_op` | `SelectFirstPartOp` | |
+| `select_second_part_op` | `SelectSecondPartOp` | |
+| `num_items` | `OffsetT` | |
+| `stream` | `cudaStream_t` | |
+| `kernel_source` | `KernelSource` | |
+| `launcher_factory` | `KernelLauncherFactory` | |
diff --git a/fern/cudapages/cub/cub/cub/DispatchUniqueByKey.mdx b/fern/cudapages/cub/cub/cub/DispatchUniqueByKey.mdx
new file mode 100644
index 0000000..b261138
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/DispatchUniqueByKey.mdx
@@ -0,0 +1,247 @@
+---
+title: cub::DispatchUniqueByKey
+description: "Utility class for dispatching the appropriately-tuned kernels for [DeviceSelect](/library/api/cub::_device_select)."
+---
+
+Utility class for dispatching the appropriately-tuned kernels for [DeviceSelect](/library/api/cub::_device_select).
+
+
+
+
+
+Random-access input iterator type for keys
+
+
+
+Random-access input iterator type for values
+
+
+
+Random-access output iterator type for keys
+
+
+
+Random-access output iterator type for values
+
+
+
+Output iterator type for recording the number of items selected
+
+
+
+Equality operator type
+
+
+
+Signed integer type for global offsets
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### DispatchUniqueByKey inline
+
+
+```cpp showLineNumbers={false}
+cub::DispatchUniqueByKey::DispatchUniqueByKey(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyInputIteratorT d_keys_in,
+ ValueInputIteratorT d_values_in,
+ KeyOutputIteratorT d_keys_out,
+ ValueOutputIteratorT d_values_out,
+ NumSelectedIteratorT d_num_selected_out,
+ EqualityOpT equality_op,
+ OffsetT num_items,
+ cudaStream_t stream,
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {}
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to [`temp_storage_bytes`](/library/api/cub::_dispatch_unique_by_key::temp_storage_bytes) and no work is done.
+
+
+
+Pointer to the input sequence of keys
+
+
+
+Pointer to the input sequence of values
+
+
+
+Pointer to the output sequence of selected data items
+
+
+
+Pointer to the output sequence of selected data items
+
+
+
+Pointer to the total number of items selected (i.e., length of `d_keys_out` or `d_values_out`)
+
+
+
+Equality operator
+
+
+
+Total number of input items (i.e., length of `d_keys_in` or `d_values_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream0.
+
+
+---
+
+## Methods
+
+### Invoke inline
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchUniqueByKey::Invoke(
+ InitKernelT init_kernel,
+ UniqueByKeySweepKernelT sweep_kernel,
+ ActivePolicyT policy = {}
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+cudaError_t cub::DispatchUniqueByKey::Invoke(
+ ActivePolicyT active_policy = {}
+)
+```
+
+
+
+
+
+---
+
+## Static methods
+
+### Dispatch inline static
+
+Internal dispatch routine.
+
+
+```cpp showLineNumbers={false}
+template
+static cudaError_t cub::DispatchUniqueByKey::Dispatch(
+ void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ KeyInputIteratorT d_keys_in,
+ ValueInputIteratorT d_values_in,
+ KeyOutputIteratorT d_keys_out,
+ ValueOutputIteratorT d_values_out,
+ NumSelectedIteratorT d_num_selected_out,
+ EqualityOpT equality_op,
+ OffsetT num_items,
+ cudaStream_t stream,
+ KernelSource kernel_source = {},
+ KernelLauncherFactory launcher_factory = {},
+ MaxPolicyT max_policy = {}
+)
+```
+
+
+**Parameters**
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to [`temp_storage_bytes`](/library/api/cub::_dispatch_unique_by_key::temp_storage_bytes) and no work is done.
+
+
+
+Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_unique_by_key::d_temp_storage) allocation
+
+
+
+Pointer to the input sequence of keys
+
+
+
+Pointer to the input sequence of values
+
+
+
+Pointer to the output sequence of selected data items
+
+
+
+Pointer to the output sequence of selected data items
+
+
+
+Pointer to the total number of items selected (i.e., length of `d_keys_out` or `d_values_out`)
+
+
+
+Equality operator
+
+
+
+Total number of input items (i.e., length of `d_keys_in` or `d_values_in`)
+
+
+
+**[optional]** CUDA stream to launch kernels within. Default is stream0.
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `INIT_KERNEL_THREADS` static constexpr | `int` | |
+| `d_temp_storage` | `void *` | Device-accessible allocation of temporary storage. |
+| `temp_storage_bytes` | `size_t &` | Reference to size in bytes of [`d_temp_storage`](/library/api/cub::_dispatch_unique_by_key::d_temp_storage) allocation. |
+| `d_keys_in` | `KeyInputIteratorT` | Pointer to the input sequence of keys. |
+| `d_values_in` | `ValueInputIteratorT` | Pointer to the input sequence of values. |
+| `d_keys_out` | `KeyOutputIteratorT` | Pointer to the output sequence of selected data items. |
+| `d_values_out` | `ValueOutputIteratorT` | Pointer to the output sequence of selected data items. |
+| `d_num_selected_out` | `NumSelectedIteratorT` | Pointer to the total number of items selected (i.e., length of `d_keys_out` or `d_values_out`). |
+| `equality_op` | `EqualityOpT` | Equality operator. |
+| `num_items` | `OffsetT` | Total number of input items (i.e., length of `d_keys_in` or `d_values_in`). |
+| `stream` | `cudaStream_t` | **[optional]** CUDA stream to launch kernels within. Default is stream0. |
+| `kernel_source` | `KernelSource` | |
+| `launcher_factory` | `KernelLauncherFactory` | |
diff --git a/fern/cudapages/cub/cub/cub/GridEvenShare.mdx b/fern/cudapages/cub/cub/cub/GridEvenShare.mdx
new file mode 100644
index 0000000..00ba382
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/GridEvenShare.mdx
@@ -0,0 +1,164 @@
+---
+title: cub::GridEvenShare
+description: "[GridEvenShare](/library/api/cub::_grid_even_share) is a descriptor utility for distributing input among CUDA thread blocks in an \"even-share\" fashion."
+---
+
+`GridEvenShare` is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion.
+
+Each thread block gets roughly the same number of input tiles.
+
+**Overview**
+
+Each thread block is assigned a consecutive sequence of input tiles. To help preserve alignment and eliminate the overhead of guarded loads for all but the last thread block, `GridEvenShare` assigns one of three different amounts of work to a given thread block: "big", "normal", or "last". The "big" workloads are one scheduling grain larger than "normal". The "last" work unit for the last thread block may be partially-full if the input is not an even multiple of the scheduling grain size.
+
+Before invoking a child grid, a parent thread will typically construct an instance of `GridEvenShare`. The instance can be passed to child thread blocks which can initialize their per-thread block offsets using [`BlockInit()`](/library/api/cub::_grid_even_share::BlockInit()).
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### GridEvenShare inline
+
+Constructor.
+
+
+```cpp showLineNumbers={false}
+cub::GridEvenShare::GridEvenShare()
+```
+
+
+---
+
+## Methods
+
+### DispatchInit inline
+
+Dispatch initializer.
+
+To be called prior to kernel launch.
+
+
+```cpp showLineNumbers={false}
+void cub::GridEvenShare::DispatchInit(
+ OffsetT num_items_,
+ int max_grid_size,
+ int tile_items
+)
+```
+
+
+**Parameters**
+
+
+Total number of input items
+
+
+
+Maximum grid size allowable (actual grid size may be less if not warranted by the number of input items)
+
+
+
+Number of data items per input tile
+
+
+### BlockInit inline
+
+
+
+
+Initializes ranges for the specified thread block index.
+
+Specialized for a "raking" access pattern in which each thread block is assigned a consecutive sequence of input tiles.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::GridEvenShare::BlockInit(
+ int block_id,
+ detail::constant_t
+)
+```
+
+
+
+
+
+Block-initialization, specialized for a "raking" access pattern in which each thread block is assigned a consecutive sequence of input tiles.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::GridEvenShare::BlockInit(
+ int block_id,
+ detail::constant_t
+)
+```
+
+
+
+
+
+Block-initialization, specialized for "strip mining" access pattern in which the input tiles assigned to each thread block are separated by a stride equal to the extent of the grid.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::GridEvenShare::BlockInit()
+```
+
+
+
+
+
+Block-initialization, specialized for a "raking" access pattern in which each thread block is assigned a consecutive sequence of input tiles.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::GridEvenShare::BlockInit(
+ OffsetT1 block_offset,
+ OffsetT1 block_end
+)
+```
+
+
+**Parameters**
+
+
+Threadblock begin offset (inclusive)
+
+
+
+Threadblock end offset (exclusive)
+
+
+
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `total_tiles` | `int` | |
+| `big_shares` | `int` | |
+| `big_share_items` | `OffsetT` | |
+| `normal_share_items` | `OffsetT` | |
+| `normal_base_offset` | `OffsetT` | |
+| `num_items` | `OffsetT` | Total number of input items. |
+| `grid_size` | `int` | Grid size in thread blocks. |
+| `block_offset` | `OffsetT` | Offset into input marking the beginning of the owning thread block's segment of input tiles. |
+| `block_end` | `OffsetT` | Offset into input marking the end (one-past) of the owning thread block's segment of input tiles. |
+| `block_stride` | `OffsetT` | Stride between input tiles. |
diff --git a/fern/cudapages/cub/cub/cub/GridQueue.mdx b/fern/cudapages/cub/cub/cub/GridQueue.mdx
new file mode 100644
index 0000000..4255986
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/GridQueue.mdx
@@ -0,0 +1,179 @@
+---
+title: cub::GridQueue
+description: "[GridQueue](/library/api/cub::_grid_queue) is a descriptor utility for dynamic queue management."
+---
+
+`GridQueue` is a descriptor utility for dynamic queue management.
+
+**Overview**
+
+`GridQueue` descriptors provide abstractions for "filling" or "draining" globally-shared vectors.
+
+A "filling" `GridQueue` works by atomically-adding to a zero-initialized counter, returning a unique offset for the calling thread to write its items. The `GridQueue` maintains the total "fill-size". The fill counter must be reset using [GridQueue::ResetFill](/library/api/cub::_grid_queue::ResetFill) by the host or kernel instance prior to the kernel instance that will be filling.
+
+Similarly, a "draining" `GridQueue` works by atomically-incrementing a zero-initialized counter, returning a unique offset for the calling thread to read its items. Threads can safely drain until the array's logical fill-size is exceeded. The drain counter must be reset using [GridQueue::ResetDrain](/library/api/cub::_grid_queue::ResetDrain) or [GridQueue::FillAndResetDrain](/library/api/cub::_grid_queue::FillAndResetDrain) by the host or kernel instance prior to the kernel instance that will be draining. (For dynamic work distribution of existing data, the corresponding fill-size is simply the number of elements in the array.)
+
+Iterative work management can be implemented simply with a pair of flip-flopping work buffers, each with an associated set of fill and drain `GridQueue` descriptors.
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+
+
+
+Signed integer type for global offsets
+
+
+
+
+
+---
+
+## Constructors
+
+### GridQueue inline
+
+
+
+
+Constructs an invalid `GridQueue` descriptor.
+
+
+```cpp showLineNumbers={false}
+cub::GridQueue::GridQueue()
+```
+
+
+
+
+
+Constructs a `GridQueue` descriptor around the device storage allocation.
+
+
+```cpp showLineNumbers={false}
+cub::GridQueue::GridQueue(
+ void *d_storage
+)
+```
+
+
+**Parameters**
+
+
+Device allocation to back the `GridQueue`. Must be at least as big as [`AllocationSize()`](/library/api/cub::_grid_queue::AllocationSize()).
+
+
+
+
+
+---
+
+## Methods
+
+### FillAndResetDrain inline
+
+This operation sets the fill-size and resets the drain counter, preparing the `GridQueue` for draining in the next kernel instance.
+
+To be called by the host or by a kernel prior to the one which will be draining.
+
+
+```cpp showLineNumbers={false}
+cudaError_t cub::GridQueue::FillAndResetDrain(
+ OffsetT fill_size,
+ cudaStream_t stream = 0
+)
+```
+
+
+### ResetDrain inline
+
+This operation resets the drain so that it may advance to meet the existing fill-size.
+
+To be called by the host or by a kernel prior to the one which will be draining.
+
+
+```cpp showLineNumbers={false}
+cudaError_t cub::GridQueue::ResetDrain(
+ cudaStream_t stream = 0
+)
+```
+
+
+### ResetFill inline
+
+This operation resets the fill counter.
+
+To be called by the host or by a kernel prior to the one which will be filling.
+
+
+```cpp showLineNumbers={false}
+cudaError_t cub::GridQueue::ResetFill(
+ cudaStream_t stream = 0
+)
+```
+
+
+### FillSize inline
+
+Returns the fill-size established by the parent or by the previous kernel.
+
+
+```cpp showLineNumbers={false}
+cudaError_t cub::GridQueue::FillSize(
+ OffsetT &fill_size,
+ cudaStream_t stream = 0
+)
+```
+
+
+### Drain inline
+
+Drain `num_items` from the queue.
+
+Returns offset from which to read items. To be called from CUDA kernel.
+
+
+```cpp showLineNumbers={false}
+OffsetT cub::GridQueue::Drain(
+ OffsetT num_items
+)
+```
+
+
+### Fill inline
+
+Fill `num_items` into the queue.
+
+Returns offset from which to write items. To be called from CUDA kernel.
+
+
+```cpp showLineNumbers={false}
+OffsetT cub::GridQueue::Fill(
+ OffsetT num_items
+)
+```
+
+
+---
+
+## Static methods
+
+### AllocationSize inline static
+
+Returns the device allocation size in bytes needed to construct a `GridQueue` instance.
+
+
+```cpp showLineNumbers={false}
+static size_t cub::GridQueue::AllocationSize()
+```
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `FILL` static constexpr | `int` | Counter indices. |
+| `DRAIN` static constexpr | `int` | |
+| `d_counters` | `OffsetT *` | Pair of counters. |
diff --git a/fern/cudapages/cub/cub/cub/InequalityWrapper.mdx b/fern/cudapages/cub/cub/cub/InequalityWrapper.mdx
new file mode 100644
index 0000000..f4e4c6f
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/InequalityWrapper.mdx
@@ -0,0 +1,57 @@
+---
+title: cub::InequalityWrapper
+description: "Inequality functor (wraps equality functor)."
+---
+
+Inequality functor (wraps equality functor).
+
+
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### InequalityWrapper inline
+
+Constructor.
+
+
+```cpp showLineNumbers={false}
+cub::InequalityWrapper::InequalityWrapper(
+ EqualityOp op
+)
+```
+
+
+---
+
+## Methods
+
+### operator() inline
+
+Boolean inequality operator, returns `t != u`.
+
+
+```cpp showLineNumbers={false}
+template <typename T, typename U>
+bool cub::InequalityWrapper::operator()(
+ T &&t,
+ U &&u
+)
+```
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `op` | `EqualityOp` | Wrapped equality operator. |
diff --git a/fern/cudapages/cub/cub/cub/PtxVersionCacheTag.mdx b/fern/cudapages/cub/cub/cub/PtxVersionCacheTag.mdx
new file mode 100644
index 0000000..a120828
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/PtxVersionCacheTag.mdx
@@ -0,0 +1,4 @@
+---
+title: cub::PtxVersionCacheTag
+description: ""
+---
diff --git a/fern/cudapages/cub/cub/cub/RadixSortTwiddle.mdx b/fern/cudapages/cub/cub/cub/RadixSortTwiddle.mdx
new file mode 100644
index 0000000..acd6b7c
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/RadixSortTwiddle.mdx
@@ -0,0 +1,70 @@
+---
+title: cub::RadixSortTwiddle
+description: "Twiddling keys for radix sort."
+---
+
+Twiddling keys for radix sort.
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Static methods
+
+### In inline static
+
+
+```cpp showLineNumbers={false}
+template
+static bit_ordered_type cub::RadixSortTwiddle::In(
+ bit_ordered_type key,
+ DecomposerT decomposer = {}
+)
+```
+
+
+### Out inline static
+
+
+```cpp showLineNumbers={false}
+template
+static bit_ordered_type cub::RadixSortTwiddle::Out(
+ bit_ordered_type key,
+ DecomposerT decomposer = {}
+)
+```
+
+
+### DefaultKey inline static
+
+
+```cpp showLineNumbers={false}
+template
+static bit_ordered_type cub::RadixSortTwiddle::DefaultKey(
+ DecomposerT decomposer = {}
+)
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `traits` | `detail::radix::traits_t< KeyT >` |
+| `bit_ordered_type` | `typename traits::bit_ordered_type` |
+| `bit_ordered_conversion_policy` | `typename traits::bit_ordered_conversion_policy` |
+| `bit_ordered_inversion_policy` | `typename traits::bit_ordered_inversion_policy` |
diff --git a/fern/cudapages/cub/cub/cub/ReduceByKeyOp.mdx b/fern/cudapages/cub/cub/cub/ReduceByKeyOp.mdx
new file mode 100644
index 0000000..85c6bce
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/ReduceByKeyOp.mdx
@@ -0,0 +1,83 @@
+---
+title: cub::ReduceByKeyOp
+description: ""
+---
+
+
+
+
+
+Binary reduction operator to apply to values
+
+
+
+
+
+---
+
+## Constructors
+
+### ReduceByKeyOp inline
+
+
+
+
+Constructor.
+
+
+```cpp showLineNumbers={false}
+cub::ReduceByKeyOp::ReduceByKeyOp()
+```
+
+
+
+
+
+Constructor.
+
+
+```cpp showLineNumbers={false}
+cub::ReduceByKeyOp::ReduceByKeyOp(
+ ReductionOpT op
+)
+```
+
+
+
+
+
+---
+
+## Methods
+
+### operator() inline
+
+Scan operator.
+
+
+```cpp showLineNumbers={false}
+template <typename KeyValuePairT>
+KeyValuePairT cub::ReduceByKeyOp::operator()(
+ const KeyValuePairT &first,
+ const KeyValuePairT &second
+)
+```
+
+
+**Parameters**
+
+
+First partial reduction
+
+
+
+Second partial reduction
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `op` | `ReductionOpT` | Wrapped reduction operator. |
diff --git a/fern/cudapages/cub/cub/cub/ReduceByKeyScanTileState.mdx b/fern/cudapages/cub/cub/cub/ReduceByKeyScanTileState.mdx
new file mode 100644
index 0000000..02c057b
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/ReduceByKeyScanTileState.mdx
@@ -0,0 +1,21 @@
+---
+title: cub::ReduceByKeyScanTileState
+description: "Tile status interface for reduction by key."
+---
+
+Tile status interface for reduction by key.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/fern/cudapages/cub/cub/cub/ReduceByKeyScanTileState_ValueT_KeyT_false.mdx b/fern/cudapages/cub/cub/cub/ReduceByKeyScanTileState_ValueT_KeyT_false.mdx
new file mode 100644
index 0000000..c83313c
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/ReduceByKeyScanTileState_ValueT_KeyT_false.mdx
@@ -0,0 +1,44 @@
+---
+title: "cub::ReduceByKeyScanTileState< ValueT, KeyT, false >"
+description: "Tile status interface for reduction by key, specialized for scan status and value types that cannot be combined into one machine word."
+---
+
+Tile status interface for reduction by key, specialized for scan status and value types that cannot be combined into one machine word.
+
+
+
+
+
+
+
+
+
+
+
+
+
+**Inherits from:** `cub::ScanTileState< KeyValuePair< KeyT, ValueT > >` (public)
+
+---
+
+## Methods
+
+### ReduceByKeyScanTileState inline
+
+Constructor.
+
+": "/library/api/cub::ReduceByKeyScanTileState%3C ValueT, KeyT, false %3E"}}>
+```cpp showLineNumbers={false}
+cub::ReduceByKeyScanTileState::ReduceByKeyScanTileState()
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `SuperClass` | `ScanTileState< KeyValuePair< KeyT, ValueT > >` |
diff --git a/fern/cudapages/cub/cub/cub/ReduceBySegmentOp.mdx b/fern/cudapages/cub/cub/cub/ReduceBySegmentOp.mdx
new file mode 100644
index 0000000..9fa4308
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/ReduceBySegmentOp.mdx
@@ -0,0 +1,95 @@
+---
+title: cub::ReduceBySegmentOp
+description: "Reduce-by-segment functor."
+---
+
+Reduce-by-segment functor.
+
+Given two cub::KeyValuePair inputs `a` and `b` and a binary associative combining operator `f(const T &x, const T &y)`, an instance of this functor returns a cub::KeyValuePair whose `key` field is `a.key + b.key`, and whose `value` field is either `b.value` if `b.key` is non-zero, or `f(a.value, b.value)` otherwise.
+
+`ReduceBySegmentOp` is an associative, non-commutative binary combining operator for input sequences of cub::KeyValuePair pairings. Such sequences are typically used to represent a segmented set of values to be reduced and a corresponding set of {0,1}-valued integer "head flags" demarcating the first value of each segment.
+
+
+
+
+
+Binary reduction operator to apply to values
+
+
+
+
+
+---
+
+## Constructors
+
+### ReduceBySegmentOp inline
+
+
+
+
+Constructor.
+
+
+```cpp showLineNumbers={false}
+cub::ReduceBySegmentOp::ReduceBySegmentOp()
+```
+
+
+
+
+
+Constructor.
+
+
+```cpp showLineNumbers={false}
+cub::ReduceBySegmentOp::ReduceBySegmentOp(
+ ReductionOpT op
+)
+```
+
+
+
+
+
+---
+
+## Methods
+
+### operator() inline
+
+Scan operator.
+
+
+```cpp showLineNumbers={false}
+template <typename KeyValuePairT>
+KeyValuePairT cub::ReduceBySegmentOp::operator()(
+ const KeyValuePairT &first,
+ const KeyValuePairT &second
+)
+```
+
+
+**Template parameters**
+
+
+KeyValuePair pairing of T (value) and OffsetT (head flag)
+
+
+**Parameters**
+
+
+First partial reduction
+
+
+
+Second partial reduction
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `op` | `ReductionOpT` | Wrapped reduction operator. |
diff --git a/fern/cudapages/cub/cub/cub/ScanTileState.mdx b/fern/cudapages/cub/cub/cub/ScanTileState.mdx
new file mode 100644
index 0000000..2811669
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/ScanTileState.mdx
@@ -0,0 +1,18 @@
+---
+title: cub::ScanTileState
+description: "Tile status interface."
+---
+
+Tile status interface.
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/fern/cudapages/cub/cub/cub/ScanTileState_T_false.mdx b/fern/cudapages/cub/cub/cub/ScanTileState_T_false.mdx
new file mode 100644
index 0000000..6c21c9e
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/ScanTileState_T_false.mdx
@@ -0,0 +1,180 @@
+---
+title: "cub::ScanTileState< T, false >"
+description: "Tile status interface specialized for scan status and value types that cannot be combined into one machine word."
+---
+
+Tile status interface specialized for scan status and value types that cannot be combined into one machine word.
+
+
+
+
+
+
+
+
+
+
+---
+
+## Methods
+
+### ScanTileState inline
+
+Constructor.
+
+": "/library/api/cub::ScanTileState%3C T, false %3E"}}>
+```cpp showLineNumbers={false}
+cub::ScanTileState::ScanTileState()
+```
+
+
+### Init inline
+
+Initializer.
+
+": "/library/api/cub::ScanTileState%3C T, false %3E"}}>
+```cpp showLineNumbers={false}
+cudaError_t cub::ScanTileState::Init(
+ int num_tiles,
+ void *d_temp_storage,
+ size_t temp_storage_bytes
+)
+```
+
+
+**Parameters**
+
+
+Number of tiles
+
+
+
+Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done.
+
+
+
+Size in bytes of `d_temp_storage` allocation
+
+
+### InitializeStatus inline
+
+Initialize (from device).
+
+": "/library/api/cub::ScanTileState%3C T, false %3E"}}>
+```cpp showLineNumbers={false}
+void cub::ScanTileState::InitializeStatus(
+ int num_tiles
+)
+```
+
+
+### SetInclusive inline
+
+Update the specified tile's inclusive value and corresponding status.
+
+": "/library/api/cub::ScanTileState%3C T, false %3E"}}>
+```cpp showLineNumbers={false}
+template
+void cub::ScanTileState::SetInclusive(
+ int tile_idx,
+ T tile_inclusive
+)
+```
+
+
+### SetPartial inline
+
+Update the specified tile's partial value and corresponding status.
+
+": "/library/api/cub::ScanTileState%3C T, false %3E"}}>
+```cpp showLineNumbers={false}
+template
+void cub::ScanTileState::SetPartial(
+ int tile_idx,
+ T tile_partial
+)
+```
+
+
+### WaitForValid inline
+
+Wait for the corresponding tile to become non-invalid.
+
+": "/library/api/cub::ScanTileState%3C T, false %3E"}}>
+```cpp showLineNumbers={false}
+template
+void cub::ScanTileState::WaitForValid(
+ int tile_idx,
+ StatusWord &status,
+ T &value,
+ DelayT delay = {}
+)
+```
+
+
+### LoadValid inline
+
+Loads and returns the tile's value.
+
+The returned value is undefined if either (a) the tile's status is invalid or (b) there is no memory fence between reading a non-invalid status and the call to LoadValid.
+
+": "/library/api/cub::ScanTileState%3C T, false %3E"}}>
+```cpp showLineNumbers={false}
+T cub::ScanTileState::LoadValid(
+ int tile_idx
+)
+```
+
+
+---
+
+## Static methods
+
+### AllocationSize inline static constexpr
+
+Compute device memory needed for tile status.
+
+": "/library/api/cub::ScanTileState%3C T, false %3E"}}>
+```cpp showLineNumbers={false}
+static constexpr cudaError_t cub::ScanTileState::AllocationSize(
+ int num_tiles,
+ size_t &temp_storage_bytes
+)
+```
+
+
+**Parameters**
+
+
+Number of tiles
+
+
+
+Size in bytes of `d_temp_storage` allocation
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `StatusValueT` | `T` |
+| `StatusWord` | `unsigned int` |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `TILE_STATUS_PADDING` static constexpr | `int` | |
+| `description_bytes_per_tile` static constexpr | `size_t` | |
+| `payload_bytes_per_tile` static constexpr | `size_t` | |
+| `d_tile_status` | `StatusWord *` | |
+| `d_tile_partial` | `T *` | |
+| `d_tile_inclusive` | `T *` | |
diff --git a/fern/cudapages/cub/cub/cub/ShiftDigitExtractor.mdx b/fern/cudapages/cub/cub/cub/ShiftDigitExtractor.mdx
new file mode 100644
index 0000000..02ebe18
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/ShiftDigitExtractor.mdx
@@ -0,0 +1,82 @@
+---
+title: cub::ShiftDigitExtractor
+description: "A wrapper type to extract digits."
+---
+
+A wrapper type to extract digits.
+
+Uses a combination of shift and bitwise AND to extract digits.
+
+
+
+
+
+
+
+
+
+
+**Inherits from:** `cub::BaseDigitExtractor< KeyT >` (public)
+
+---
+
+## Constructors
+
+### ShiftDigitExtractor inline explicit
+
+
+```cpp showLineNumbers={false}
+cub::ShiftDigitExtractor::ShiftDigitExtractor(
+ ::cuda::std::uint32_t bit_start = 0,
+ ::cuda::std::uint32_t num_bits = 0
+)
+```
+
+
+---
+
+## Methods
+
+### Digit inline const
+
+
+```cpp showLineNumbers={false}
+::cuda::std::uint32_t cub::ShiftDigitExtractor::Digit(
+ UnsignedBits key
+) const
+```
+
+
+---
+
+## Static methods
+
+### ProcessFloatMinusZero inline static
+
+
+```cpp showLineNumbers={false}
+static UnsignedBits cub::BaseDigitExtractor::ProcessFloatMinusZero(
+ UnsignedBits key
+)
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `TraitsT` | `Traits< KeyT >` |
+| `UnsignedBits` | `typename TraitsT::UnsignedBits` |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `bit_start` | `::cuda::std::uint32_t` | |
+| `mask` | `::cuda::std::uint32_t` | |
diff --git a/fern/cudapages/cub/cub/cub/SmVersionCacheTag.mdx b/fern/cudapages/cub/cub/cub/SmVersionCacheTag.mdx
new file mode 100644
index 0000000..54e358f
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/SmVersionCacheTag.mdx
@@ -0,0 +1,4 @@
+---
+title: cub::SmVersionCacheTag
+description: ""
+---
diff --git a/fern/cudapages/cub/cub/cub/SwizzleScanOp.mdx b/fern/cudapages/cub/cub/cub/SwizzleScanOp.mdx
new file mode 100644
index 0000000..017892f
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/SwizzleScanOp.mdx
@@ -0,0 +1,57 @@
+---
+title: cub::SwizzleScanOp
+description: "Binary operator wrapper for switching non-commutative scan arguments."
+---
+
+Binary operator wrapper for switching non-commutative scan arguments.
+
+
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### SwizzleScanOp inline
+
+Constructor.
+
+
+```cpp showLineNumbers={false}
+cub::SwizzleScanOp::SwizzleScanOp(
+ ScanOp scan_op
+)
+```
+
+
+---
+
+## Methods
+
+### operator() inline
+
+Switch the scan arguments.
+
+
+```cpp showLineNumbers={false}
+template <typename T>
+T cub::SwizzleScanOp::operator()(
+ const T &a,
+ const T &b
+)
+```
+
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `scan_op` | `ScanOp` | Wrapped scan operator. |
diff --git a/fern/cudapages/cub/cub/cub/TilePrefixCallbackOp.mdx b/fern/cudapages/cub/cub/cub/TilePrefixCallbackOp.mdx
new file mode 100644
index 0000000..c76d316
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/TilePrefixCallbackOp.mdx
@@ -0,0 +1,192 @@
+---
+title: cub::TilePrefixCallbackOp
+description: "Stateful block-scan prefix functor."
+---
+
+Stateful block-scan prefix functor. Provides the running prefix for the current tile by using the callback warp to wait for aggregates/prefixes from predecessor tiles to become available.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Implementation detail, do not specify directly, requirements on the content of this type are subject to breaking change.
+
+
+
+
+
+---
+
+## Constructors
+
+### TilePrefixCallbackOp inline
+
+
+
+
+
+```cpp showLineNumbers={false}
+cub::TilePrefixCallbackOp::TilePrefixCallbackOp(
+ ScanTileStateT &tile_status,
+ TempStorage &temp_storage,
+ ScanOpT scan_op,
+ int tile_idx
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+cub::TilePrefixCallbackOp::TilePrefixCallbackOp(
+ ScanTileStateT &tile_status,
+ TempStorage &temp_storage,
+ ScanOpT scan_op
+)
+```
+
+
+
+
+
+---
+
+## Methods
+
+### ProcessWindow inline
+
+Block until all predecessors within the warp-wide window have non-invalid status.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::TilePrefixCallbackOp::ProcessWindow(
+ int predecessor_idx,
+ StatusWord &predecessor_status,
+ T &window_aggregate,
+ DelayT delay = {}
+)
+```
+
+
+**Parameters**
+
+
+Preceding tile index to inspect
+
+
+
+Preceding tile status
+
+
+
+Relevant partial reduction from this window of preceding tiles
+
+
+### operator() inline
+
+
+```cpp showLineNumbers={false}
+T cub::TilePrefixCallbackOp::operator()(
+ T block_aggregate
+)
+```
+
+
+### GetExclusivePrefix inline
+
+
+```cpp showLineNumbers={false}
+T cub::TilePrefixCallbackOp::GetExclusivePrefix()
+```
+
+
+### GetInclusivePrefix inline
+
+
+```cpp showLineNumbers={false}
+T cub::TilePrefixCallbackOp::GetInclusivePrefix()
+```
+
+
+### GetBlockAggregate inline
+
+
+```cpp showLineNumbers={false}
+T cub::TilePrefixCallbackOp::GetBlockAggregate()
+```
+
+
+### GetTileIdx inline const
+
+
+```cpp showLineNumbers={false}
+int cub::TilePrefixCallbackOp::GetTileIdx() const
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `WarpReduceT` | `WarpReduce< T,(1<<(5))>` |
+| `StatusWord` | `typename ScanTileStateT::StatusWord` |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `temp_storage` | `_TempStorage &` | Reference to a warp-reduction instance. |
+| `tile_status` | `ScanTileStateT &` | Interface to tile status. |
+| `scan_op` | `ScanOpT` | Binary scan operator. |
+| `tile_idx` | `int` | The current tile index. |
+| `exclusive_prefix` | `T` | Exclusive prefix for the tile. |
+| `inclusive_prefix` | `T` | Inclusive prefix for the tile. |
+
+---
+
+## Inner classes
+
+### _TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::TilePrefixCallbackOp::_TempStorage
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `warp_reduce` | `WarpReduceT::TempStorage` | |
+| `exclusive_prefix` | `T` | |
+| `inclusive_prefix` | `T` | |
+| `block_aggregate` | `T` | |
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::TilePrefixCallbackOp::TempStorage
+```
+
+
+**Inherits from:** `Uninitialized< _TempStorage >` (public)
diff --git a/fern/cudapages/cub/cub/cub/WarpExchange.mdx b/fern/cudapages/cub/cub/cub/WarpExchange.mdx
new file mode 100644
index 0000000..430279d
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/WarpExchange.mdx
@@ -0,0 +1,244 @@
+---
+title: cub::WarpExchange
+description: "The [WarpExchange](/library/api/cub::_warp_exchange) class provides [collective](../index.html#sec0) methods for rearranging data partitioned across a CUDA warp."
+---
+
+The `WarpExchange` class provides [collective](../index.html#sec0) methods for rearranging data partitioned across a CUDA warp.
+
+**Overview**
+
+- It is commonplace for a warp of threads to rearrange data items between threads. For example, the global memory accesses prefer patterns where data items are "striped" across threads (where consecutive threads access consecutive items), yet most warp-wide operations prefer a "blocked" partitioning of items across threads (where consecutive items belong to a single thread).
+- `WarpExchange` supports the following types of data exchanges:
+Transposing between blocked and striped arrangements
+Scattering ranked items to a striped arrangement
+ - Transposing between [blocked](../index.html#sec5sec3) and [striped](../index.html#sec5sec3) arrangements
+ - Scattering ranked items to a [striped arrangement](../index.html#sec5sec3)
+
+**A Simple Example**
+
+
+The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement of 64 integer items partitioned across 16 threads where each thread owns 4 items.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/warp/warp_exchange.cuh>
+
+__global__ void ExampleKernel(int *d_data, ...)
+{
+ constexpr int warp_threads = 16;
+ constexpr int block_threads = 256;
+ constexpr int items_per_thread = 4;
+ constexpr int warps_per_block = block_threads / warp_threads;
+ const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
+
+ // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each
+ using WarpExchangeT =
+ cub::WarpExchange<int, items_per_thread, warp_threads>;
+
+ // Allocate shared memory for WarpExchange
+ __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block];
+
+ // Load a tile of data striped across threads
+ int thread_data[items_per_thread];
+ // ...
+
+ // Collectively exchange data into a blocked arrangement across threads
+ WarpExchangeT(temp_storage[warp_id]).StripedToBlocked(thread_data, thread_data);
+```
+
+Suppose the set of striped input `thread_data` across the block of threads is `{ [0,16,32,48], [1,17,33,49], ..., [15,31,47,63] }`. The corresponding output `thread_data` in those threads will be `{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [60,61,62,63] }`.
+
+
+
+
+
+
+
+
+The number of items partitioned onto each thread.
+
+
+
+**[optional]** The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a power of two.
+
+
+
+
+
+
+
+
+**Inherits from:** `detail::InternalWarpExchangeImpl< InputT, ITEMS_PER_THREAD, detail::warp_threads, WARP_EXCHANGE_SMEM >` (private)
+
+---
+
+## Collective constructors
+
+### WarpExchange inline explicit
+
+
+
+
+Collective constructor using the specified memory allocation as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::WarpExchange::WarpExchange(
+ TempStorage &temp_storage
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+cub::WarpExchange::WarpExchange() = delete
+```
+
+
+
+
+
+---
+
+## Data movement
+
+### BlockedToStriped inline
+
+Transposes data items from *blocked* arrangement to *striped* arrangement.
+
+
+```cpp showLineNumbers={false}
+template <typename OutputT>
+void cub::WarpExchange::BlockedToStriped(
+ const InputT (&input_items)[ITEMS_PER_THREAD],
+ OutputT (&output_items)[ITEMS_PER_THREAD]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Parameters**
+
+
+Items to exchange, converting between *blocked* and *striped* arrangements.
+
+
+
+Items from exchange, converting between *striped* and *blocked* arrangements. May be aliased to `input_items`.
+
+
+### StripedToBlocked inline
+
+Transposes data items from *striped* arrangement to *blocked* arrangement.
+
+
+```cpp showLineNumbers={false}
+template <typename OutputT>
+void cub::WarpExchange::StripedToBlocked(
+ const InputT (&input_items)[ITEMS_PER_THREAD],
+ OutputT (&output_items)[ITEMS_PER_THREAD]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Parameters**
+
+
+Items to exchange
+
+
+
+Items from exchange. May be aliased to `input_items`.
+
+
+### ScatterToStriped inline
+
+
+
+
+Exchanges valid data items annotated by rank into *striped* arrangement.
+
+
+```cpp showLineNumbers={false}
+template <typename OffsetT>
+void cub::WarpExchange::ScatterToStriped(
+ InputT (&items)[ITEMS_PER_THREAD],
+ OffsetT (&ranks)[ITEMS_PER_THREAD]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+**[inferred]** Signed integer type for local offsets
+
+
+**Parameters**
+
+
+Items to exchange
+
+
+
+Corresponding scatter ranks
+
+
+
+
+
+Exchanges valid data items annotated by rank into *striped* arrangement.
+
+
+```cpp showLineNumbers={false}
+template <typename OutputT, typename OffsetT>
+void cub::WarpExchange::ScatterToStriped(
+ const InputT (&input_items)[ITEMS_PER_THREAD],
+ OutputT (&output_items)[ITEMS_PER_THREAD],
+ OffsetT (&ranks)[ITEMS_PER_THREAD]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+**Template parameters**
+
+
+**[inferred]** Signed integer type for local offsets
+
+
+**Parameters**
+
+
+Items to exchange
+
+
+
+Items from exchange. May be aliased to `input_items`.
+
+
+
+Corresponding scatter ranks
+
+
+
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `InternalWarpExchange` | `detail::InternalWarpExchangeImpl< InputT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS, WARP_EXCHANGE_ALGORITHM >` | |
+| `TempStorage` | `typename InternalWarpExchange::TempStorage` | The operations exposed by `WarpExchange` require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the `__shared__` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or `union`'d with other storage allocation types to facilitate memory reuse. |
diff --git a/fern/cudapages/cub/cub/cub/WarpLoad.mdx b/fern/cudapages/cub/cub/cub/WarpLoad.mdx
new file mode 100644
index 0000000..350cd84
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/WarpLoad.mdx
@@ -0,0 +1,421 @@
+---
+title: cub::WarpLoad
+description: ""
+---
+
+The WarpLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread warp.
+
+## Example
+
+The code snippet below illustrates the loading of a linear segment of 64 integers into a "blocked" arrangement across 16 threads where each thread owns 4 consecutive items. The load is specialized for `WARP_LOAD_TRANSPOSE`, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads).
+
+Suppose the input `d_data` is `0, 1, 2, 3, 4, 5, ...`. The set of `thread_data` across the first logical warp of threads will be: `{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/warp/warp_load.cuh>
+
+__global__ void ExampleKernel(int *d_data, ...)
+{
+ constexpr int warp_threads = 16;
+ constexpr int block_threads = 256;
+ constexpr int items_per_thread = 4;
+
+ // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each
+ using WarpLoadT = WarpLoad<int, items_per_thread, cub::WARP_LOAD_TRANSPOSE, warp_threads>;
+
+ constexpr int warps_in_block = block_threads / warp_threads;
+ constexpr int tile_size = items_per_thread * warp_threads;
+ const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
+
+ // Allocate shared memory for WarpLoad
+ __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block];
+
+ // Load a segment of consecutive items that are blocked across threads
+ int thread_data[items_per_thread];
+ WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, thread_data);
+}
+```
+
+
+
+
+
+The data type to read into (which must be convertible from the input iterator's value type).
+
+
+
+The number of consecutive items partitioned onto each thread.
+
+
+
+**[optional]** [cub::WarpLoadAlgorithm](/library/api/cub::WarpLoadAlgorithm) tuning policy. default: [cub::WARP_LOAD_DIRECT](/library/api/cub::WARP_LOAD_DIRECT).
+
+
+
+**[optional]** The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a power of two.
+
+
+
+
+
+---
+
+## Collective constructors
+
+### WarpLoad inline
+
+
+
+
+Collective constructor using a private static allocation of shared memory as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::WarpLoad::WarpLoad()
+```
+
+
+
+
+
+Collective constructor using the specified memory allocation as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::WarpLoad::WarpLoad(
+ TempStorage &temp_storage
+)
+```
+
+
+
+
+
+---
+
+## Data movement
+
+### Load inline
+
+
+
+
+Load a linear segment of items from memory.
+
+
+```cpp showLineNumbers={false}
+template <typename InputIteratorT>
+void cub::WarpLoad::Load(
+ InputIteratorT block_itr,
+ InputT (&items)[ITEMS_PER_THREAD]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+The thread block's base input iterator for loading from
+
+
+
+Data to load
+
+
+**Example**
+
+Suppose the input `d_data` is `0, 1, 2, 3, 4, 5, ...`. The set of `thread_data` across the first logical warp of threads will be: `{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/warp/warp_load.cuh>
+
+__global__ void ExampleKernel(int *d_data, ...)
+{
+ constexpr int warp_threads = 16;
+ constexpr int block_threads = 256;
+ constexpr int items_per_thread = 4;
+
+ // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each
+ using WarpLoadT = WarpLoad<int, items_per_thread, cub::WARP_LOAD_TRANSPOSE, warp_threads>;
+
+ constexpr int warps_in_block = block_threads / warp_threads;
+ constexpr int tile_size = items_per_thread * warp_threads;
+ const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
+
+ // Allocate shared memory for WarpLoad
+ __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block];
+
+ // Load a segment of consecutive items that are blocked across threads
+ int thread_data[items_per_thread];
+ WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, thread_data);
+}
+```
+
+
+
+
+Load a linear segment of items from memory, guarded by range.
+
+
+```cpp showLineNumbers={false}
+template <typename InputIteratorT>
+void cub::WarpLoad::Load(
+ InputIteratorT block_itr,
+ InputT (&items)[ITEMS_PER_THREAD],
+ int valid_items
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+The thread block's base input iterator for loading from
+
+
+
+Data to load
+
+
+
+Number of valid items to load
+
+
+**Example**
+
+Suppose the input `d_data` is `0, 1, 2, 3, 4, 5, ...` and `valid_items` is `5`. The set of `thread_data` across the first logical warp of threads will be: `{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }` with only the first two threads being unmasked to load portions of valid data (and other items remaining unassigned).
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/warp/warp_load.cuh>
+
+__global__ void ExampleKernel(int *d_data, int valid_items, ...)
+{
+ constexpr int warp_threads = 16;
+ constexpr int block_threads = 256;
+ constexpr int items_per_thread = 4;
+
+ // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each
+ using WarpLoadT = WarpLoad<int, items_per_thread, cub::WARP_LOAD_TRANSPOSE, warp_threads>;
+
+ constexpr int warps_in_block = block_threads / warp_threads;
+ constexpr int tile_size = items_per_thread * warp_threads;
+ const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
+
+ // Allocate shared memory for WarpLoad
+ __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block];
+
+ // Load a segment of consecutive items that are blocked across threads
+ int thread_data[items_per_thread];
+ WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, thread_data,
+ valid_items);
+}
+```
+
+
+
+
+Load a linear segment of items from memory, guarded by range.
+
+
+```cpp showLineNumbers={false}
+template <typename InputIteratorT, typename DefaultT>
+void cub::WarpLoad::Load(
+ InputIteratorT block_itr,
+ InputT (&items)[ITEMS_PER_THREAD],
+ int valid_items,
+ DefaultT oob_default
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+The thread block's base input iterator for loading from
+
+
+
+Data to load
+
+
+
+Number of valid items to load
+
+
+
+Default value to assign out-of-bound items
+
+
+**Example**
+
+Suppose the input `d_data` is `0, 1, 2, 3, 4, 5, ...`, `valid_items` is `5`, and the out-of-bounds default is `-1`. The set of `thread_data` across the first logical warp of threads will be: `{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }` with only the first two threads being unmasked to load portions of valid data (and other items are assigned `-1`).
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/warp/warp_load.cuh>
+
+__global__ void ExampleKernel(int *d_data, int valid_items, ...)
+{
+ constexpr int warp_threads = 16;
+ constexpr int block_threads = 256;
+ constexpr int items_per_thread = 4;
+
+ // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each
+ using WarpLoadT = WarpLoad<int, items_per_thread, cub::WARP_LOAD_TRANSPOSE, warp_threads>;
+
+ constexpr int warps_in_block = block_threads / warp_threads;
+ constexpr int tile_size = items_per_thread * warp_threads;
+ const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
+
+ // Allocate shared memory for WarpLoad
+ __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block];
+
+ // Load a segment of consecutive items that are blocked across threads
+ int thread_data[items_per_thread];
+ WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
+ thread_data,
+ valid_items,
+ -1);
+```
+
+
+
+
+---
+
+## Utility methods
+
+### PrivateStorage inline
+
+Internal storage allocator.
+
+
+```cpp showLineNumbers={false}
+_TempStorage & cub::WarpLoad::PrivateStorage()
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `InternalLoad` | `LoadInternal< ALGORITHM, 0 >` | Internal load implementation to use. |
+| `_TempStorage` | `typename InternalLoad::TempStorage` | Shared memory storage layout type. |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `IS_ARCH_WARP` static constexpr | `bool` | |
+| `temp_storage` | `_TempStorage &` | Thread reference to shared storage. |
+| `linear_tid` | `int` | Linear thread-id. |
+
+---
+
+## Inner classes
+
+### LoadInternal
+
+
+```cpp showLineNumbers={false}
+struct cub::WarpLoad::LoadInternal
+```
+
+
+Load helper.
+
+### LoadInternal< WARP_LOAD_DIRECT, DUMMY >
+
+
+```cpp showLineNumbers={false}
+struct cub::WarpLoad::LoadInternal< WARP_LOAD_DIRECT, DUMMY >
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `linear_tid` | `int` | |
+
+### LoadInternal< WARP_LOAD_STRIPED, DUMMY >
+
+
+```cpp showLineNumbers={false}
+struct cub::WarpLoad::LoadInternal< WARP_LOAD_STRIPED, DUMMY >
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `linear_tid` | `int` | |
+
+### LoadInternal< WARP_LOAD_VECTORIZE, DUMMY >
+
+
+```cpp showLineNumbers={false}
+struct cub::WarpLoad::LoadInternal< WARP_LOAD_VECTORIZE, DUMMY >
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `linear_tid` | `int` | |
+
+### LoadInternal< WARP_LOAD_TRANSPOSE, DUMMY >
+
+
+```cpp showLineNumbers={false}
+struct cub::WarpLoad::LoadInternal< WARP_LOAD_TRANSPOSE, DUMMY >
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `temp_storage` | `_TempStorage &` | |
+| `linear_tid` | `int` | |
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::WarpLoad::TempStorage
+```
+
+
+The operations exposed by `WarpLoad` require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the `__shared__` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or `union`'d with other storage allocation types to facilitate memory reuse.
+
+**Inherits from:** `Uninitialized< _TempStorage >` (public)
diff --git a/fern/cudapages/cub/cub/cub/WarpMergeSort.mdx b/fern/cudapages/cub/cub/cub/WarpMergeSort.mdx
new file mode 100644
index 0000000..891f13f
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/WarpMergeSort.mdx
@@ -0,0 +1,145 @@
+---
+title: cub::WarpMergeSort
+description: ""
+---
+
+The WarpMergeSort class provides methods for sorting items partitioned across a CUDA warp using a merge sorting method.
+
+## Example
+
+The code snippet below illustrates a sort of 64 integer keys that are partitioned across 16 threads where each thread owns 4 consecutive items.
+
+Suppose the set of input `thread_keys` across the warp of threads is `{ [0,64,1,63], [2,62,3,61], [4,60,5,59], ..., [31,34,32,33] }`. The corresponding output `thread_keys` in those threads will be `{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [31,32,33,34] }`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/warp/warp_merge_sort.cuh>
+
+struct CustomLess
+{
+ template <typename DataType>
+ __device__ bool operator()(const DataType &lhs, const DataType &rhs)
+ {
+ return lhs < rhs;
+ }
+};
+
+__global__ void ExampleKernel(...)
+{
+ constexpr int warp_threads = 16;
+ constexpr int block_threads = 256;
+ constexpr int items_per_thread = 4;
+ constexpr int warps_per_block = block_threads / warp_threads;
+ const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
+
+ // Specialize WarpMergeSort for a virtual warp of 16 threads
+ // owning 4 integer items each
+ using WarpMergeSortT =
+ cub::WarpMergeSort<int, items_per_thread, warp_threads>;
+
+ // Allocate shared memory for WarpMergeSort
+ __shared__ typename WarpMergeSortT::TempStorage temp_storage[warps_per_block];
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_keys[items_per_thread];
+ // ...
+
+ WarpMergeSortT(temp_storage[warp_id]).Sort(thread_keys, CustomLess());
+ // ...
+}
+```
+
+
+
+
+
+Key type
+
+
+
+The number of items per thread
+
+
+
+**[optional]** The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a power of two.
+
+
+
+**[optional]** Value type (default: cub::NullType, which indicates a keys-only sort)
+
+
+
+
+
+**Inherits from:** `cub::BlockMergeSortStrategy< KeyT, NullType, detail::warp_threads, ITEMS_PER_THREAD, WarpMergeSort< KeyT, ITEMS_PER_THREAD, detail::warp_threads, NullType > >` (public)
+
+---
+
+## Constructors
+
+### WarpMergeSort inline
+
+
+
+
+
+```cpp showLineNumbers={false}
+cub::WarpMergeSort::WarpMergeSort(
+ typename BlockMergeSortStrategyT::TempStorage &temp_storage
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+cub::WarpMergeSort::WarpMergeSort() = delete
+```
+
+
+
+
+
+---
+
+## Methods
+
+### get_member_mask inline const
+
+
+```cpp showLineNumbers={false}
+unsigned int cub::WarpMergeSort::get_member_mask() const
+```
+
+
+### SyncImplementation inline const
+
+
+```cpp showLineNumbers={false}
+void cub::WarpMergeSort::SyncImplementation() const
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `BlockMergeSortStrategyT` | `BlockMergeSortStrategy< KeyT, ValueT, LOGICAL_WARP_THREADS, ITEMS_PER_THREAD, WarpMergeSort >` |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `IS_ARCH_WARP` static constexpr | `bool` | |
+| `KEYS_ONLY` static constexpr | `bool` | |
+| `TILE_SIZE` static constexpr | `int` | |
+| `warp_id` | `const unsigned int` | |
+| `member_mask` | `const unsigned int` | |
+| `BlockMergeSortStrategyT` | `friend` | |
diff --git a/fern/cudapages/cub/cub/cub/WarpReduce.mdx b/fern/cudapages/cub/cub/cub/WarpReduce.mdx
new file mode 100644
index 0000000..a2ca6b8
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/WarpReduce.mdx
@@ -0,0 +1,749 @@
+---
+title: cub::WarpReduce
+description: ""
+---
+
+The `WarpReduce` class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread warp.
+
+
+
+## Performance considerations
+
+- Uses special instructions when applicable (e.g., warp `SHFL` instructions)
+- Uses synchronization-free communication between warp lanes when applicable
+- Incurs zero bank conflicts for most types
+- Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+
+ - Summation (**vs.** generic reduction)
+ - The architecture's warp size is a whole multiple of `LogicalWarpThreads`
+
+## Example
+
+The code snippet below illustrates four concurrent warp sum reductions within a block of 128 threads (one per each of the 32-thread warps).
+
+Suppose the set of input `thread_data` across the block of threads is `{0, 1, 2, 3, ..., 127}`. The corresponding output `aggregate` in threads 0, 32, 64, and 96 will be `496`, `1520`, `2544`, and `3568`, respectively (and is undefined in other threads).
+
+The code snippet below illustrates a single warp sum reduction within a block of 128 threads.
+
+Suppose the set of input `thread_data` across the warp of threads is `{0, 1, 2, 3, ..., 31}`. The corresponding output `aggregate` in thread 0 will be `496` (and is undefined in other threads).
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpReduce for type int
+ using WarpReduce = cub::WarpReduce<int>;
+ // Allocate WarpReduce shared memory for 4 warps
+ __shared__ typename WarpReduce::TempStorage temp_storage[4];
+ // Obtain one input item per thread
+ int thread_data = ...
+ // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
+ int warp_id = threadIdx.x / 32;
+ int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
+}
+
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpReduce for type int
+ using WarpReduce = cub::WarpReduce<int>;
+ // Allocate WarpReduce shared memory for one warp
+ __shared__ typename WarpReduce::TempStorage temp_storage;
+ ...
+ // Only the first warp performs a reduction
+ if (threadIdx.x < 32)
+ {
+ // Obtain one input item per thread
+ int thread_data = ...
+ // Return the warp-wide sum to lane0
+ int aggregate = WarpReduce(temp_storage).Sum(thread_data);
+ }
+}
+```
+
+
+
+
+
+The reduction input/output element type
+
+
+
+**[optional]** The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20).
+
+
+
+
+
+---
+
+## Collective constructors
+
+### WarpReduce inline
+
+Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from `threadIdx.x`.
+
+
+```cpp showLineNumbers={false}
+cub::WarpReduce::WarpReduce(
+ TempStorage &temp_storage
+)
+```
+
+
+**Parameters**
+
+
+Reference to memory allocation having layout type [TempStorage](/library/api/cub::WarpReduce::TempStorage)
+
+
+---
+
+## Summation reductions
+
+### Sum inline nodiscard
+
+
+
+
+Computes a warp-wide sum in the calling warp. The output is valid in warp *lane*0.
+
+
+```cpp showLineNumbers={false}
+T cub::WarpReduce::Sum(
+ T input
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Example**
+
+The code snippet below illustrates four concurrent warp sum reductions within a block of 128 threads (one per each of the 32-thread warps).
+
+Suppose the set of input `thread_data` across the block of threads is `{0, 1, 2, 3, ..., 127}`. The corresponding output `aggregate` in threads 0, 32, 64, and 96 will be `496`, `1520`, `2544`, and `3568`, respectively (and is undefined in other threads).
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpReduce for type int
+ using WarpReduce = cub::WarpReduce<int>;
+ // Allocate WarpReduce shared memory for 4 warps
+ __shared__ typename WarpReduce::TempStorage temp_storage[4];
+ // Obtain one input item per thread
+ int thread_data = ...
+ // Return the warp-wide sums to each lane0
+ int warp_id = threadIdx.x / 32;
+ int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
+}
+```
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+T cub::WarpReduce::Sum(
+ const InputType &input
+)
+```
+
+
+
+
+
+Computes a partially-full warp-wide sum in the calling warp. The output is valid in warp *lane*0.
+
+All threads across the calling warp must agree on the same value for `valid_items`. Otherwise the result is undefined.
+
+
+```cpp showLineNumbers={false}
+T cub::WarpReduce::Sum(
+ T input,
+ int valid_items
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Calling thread's input
+
+
+
+Total number of valid items in the calling thread's logical warp (may be less than `LogicalWarpThreads`)
+
+
+**Example**
+
+The code snippet below illustrates a sum reduction within a single, partially-full block of 32 threads (one warp).
+
+Suppose the input `d_data` is `{0, 1, 2, 3, 4, ...}` and `valid_items` is `4`. The corresponding output `aggregate` in *lane*0 is `6` (and is undefined in other threads).
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(int *d_data, int valid_items)
+{
+ // Specialize WarpReduce for type int
+ using WarpReduce = cub::WarpReduce<int>;
+
+ // Allocate WarpReduce shared memory for one warp
+ __shared__ typename WarpReduce::TempStorage temp_storage;
+
+ // Obtain one input item per thread if in range
+ int thread_data;
+ if (threadIdx.x < valid_items)
+ thread_data = d_data[threadIdx.x];
+
+ // Return the warp-wide sums to each lane0
+ int aggregate = WarpReduce(temp_storage).Sum(thread_data, valid_items);
+}
+```
+
+
+
+
+### Max inline nodiscard
+
+
+
+
+
+```cpp showLineNumbers={false}
+T cub::WarpReduce::Max(
+ T input
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+T cub::WarpReduce::Max(
+ const InputType &input
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+T cub::WarpReduce::Max(
+ T input,
+ int valid_items
+)
+```
+
+
+
+
+
+### Min inline nodiscard
+
+
+
+
+
+```cpp showLineNumbers={false}
+T cub::WarpReduce::Min(
+ T input
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+T cub::WarpReduce::Min(
+ const InputType &input
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+T cub::WarpReduce::Min(
+ T input,
+ int valid_items
+)
+```
+
+
+
+
+
+### HeadSegmentedSum inline nodiscard
+
+Computes a segmented sum in the calling warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes *lane*0).
+
+
+```cpp showLineNumbers={false}
+template <typename FlagT>
+T cub::WarpReduce::HeadSegmentedSum(
+ T input,
+ FlagT head_flag
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Calling thread's input
+
+
+
+Head flag denoting whether or not `input` is the start of a new segment
+
+
+**Example**
+
+The code snippet below illustrates a head-segmented warp sum reduction within a block of 32 threads (one warp).
+
+Suppose the set of input `thread_data` and `head_flag` across the block of threads is `{0, 1, 2, 3, ..., 31}` and `{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0}`, respectively. The corresponding output `aggregate` in threads 0, 4, 8, etc. will be `6`, `22`, `38`, etc. (and is undefined in other threads).
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpReduce for type int
+ using WarpReduce = cub::WarpReduce<int>;
+
+ // Allocate WarpReduce shared memory for one warp
+ __shared__ typename WarpReduce::TempStorage temp_storage;
+
+ // Obtain one input item and flag per thread
+ int thread_data = ...
+ int head_flag = ...
+
+ // Return the warp-wide sums to each lane0
+ int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(
+ thread_data, head_flag);
+}
+```
+
+### TailSegmentedSum inline nodiscard
+
+Computes a segmented sum in the calling warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes *lane*0).
+
+
+```cpp showLineNumbers={false}
+template <typename FlagT>
+T cub::WarpReduce::TailSegmentedSum(
+ T input,
+ FlagT tail_flag
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Calling thread's input
+
+
+
+Tail flag denoting whether or not `input` is the end of the current segment
+
+
+**Example**
+
+The code snippet below illustrates a tail-segmented warp sum reduction within a block of 32 threads (one warp).
+
+Suppose the set of input `thread_data` and `tail_flag` across the block of threads is `{0, 1, 2, 3, ..., 31}` and `{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1}`, respectively. The corresponding output `aggregate` in threads 0, 4, 8, etc. will be `6`, `22`, `38`, etc. (and is undefined in other threads).
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpReduce for type int
+ using WarpReduce = cub::WarpReduce<int>;
+
+ // Allocate WarpReduce shared memory for one warp
+ __shared__ typename WarpReduce::TempStorage temp_storage;
+
+ // Obtain one input item and flag per thread
+ int thread_data = ...
+ int tail_flag = ...
+
+ // Return the warp-wide sums to each lane0
+ int aggregate = WarpReduce(temp_storage).TailSegmentedSum(
+ thread_data, tail_flag);
+```
+
+---
+
+## Generic reductions
+
+### Reduce inline nodiscard
+
+
+
+
+Computes a warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp *lane*0.
+
+Supports non-commutative reduction operators
+
+
+```cpp showLineNumbers={false}
+template <typename ReductionOp>
+T cub::WarpReduce::Reduce(
+ T input,
+ ReductionOp reduction_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary reduction operator type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input
+
+
+
+Binary reduction operator
+
+
+**Example**
+
+The code snippet below illustrates four concurrent warp max reductions within a block of 128 threads (one per each of the 32-thread warps).
+
+Suppose the set of input `thread_data` across the block of threads is `{0, 1, 2, 3, ..., 127}`. The corresponding output `aggregate` in threads 0, 32, 64, and 96 will be `31`, `63`, `95`, and `127`, respectively (and is undefined in other threads).
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpReduce for type int
+ using WarpReduce = cub::WarpReduce<int>;
+
+ // Allocate WarpReduce shared memory for 4 warps
+ __shared__ typename WarpReduce::TempStorage temp_storage[4];
+
+ // Obtain one input item per thread
+ int thread_data = ...
+
+ // Return the warp-wide reductions to each lane0
+ int warp_id = threadIdx.x / 32;
+ int aggregate = WarpReduce(temp_storage[warp_id]).Reduce(
+ thread_data, cuda::maximum<>{});
+```
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+T cub::WarpReduce::Reduce(
+ const InputType &input,
+ ReductionOp reduction_op
+)
+```
+
+
+
+
+
+Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp *lane*0.
+
+All threads across the calling warp must agree on the same value for `valid_items`. Otherwise the result is undefined.
+
+Supports non-commutative reduction operators
+
+
+```cpp showLineNumbers={false}
+template <typename ReductionOp>
+T cub::WarpReduce::Reduce(
+ T input,
+ ReductionOp reduction_op,
+ int valid_items
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary reduction operator type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input
+
+
+
+Binary reduction operator
+
+
+
+Total number of valid items in the calling thread's logical warp (may be less than `LogicalWarpThreads`)
+
+
+**Example**
+
+The code snippet below illustrates a max reduction within a single, partially-full block of 32 threads (one warp).
+
+Suppose the input `d_data` is `{0, 1, 2, 3, 4, ...}` and `valid_items` is `4`. The corresponding output `aggregate` in *thread*<sub>0</sub> is `3` (and is undefined in other threads).
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(int *d_data, int valid_items)
+{
+ // Specialize WarpReduce for type int
+ using WarpReduce = cub::WarpReduce<int>;
+
+ // Allocate WarpReduce shared memory for one warp
+ __shared__ typename WarpReduce::TempStorage temp_storage;
+
+ // Obtain one input item per thread if in range
+ int thread_data;
+ if (threadIdx.x < valid_items)
+ thread_data = d_data[threadIdx.x];
+
+ // Return the warp-wide reductions to each lane0
+ int aggregate = WarpReduce(temp_storage).Reduce(
+ thread_data, cuda::maximum<>{}, valid_items);
+```
+
+
+
+
+### HeadSegmentedReduce inline nodiscard
+
+Computes a segmented reduction in the calling warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes *lane*<sub>0</sub>).
+
+Supports non-commutative reduction operators
+
+
+```cpp showLineNumbers={false}
+template <typename ReductionOp, typename FlagT>
+T cub::WarpReduce::HeadSegmentedReduce(
+ T input,
+ FlagT head_flag,
+ ReductionOp reduction_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary reduction operator type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input
+
+
+
+Head flag denoting whether or not `input` is the start of a new segment
+
+
+
+Reduction operator
+
+
+**Example**
+
+The code snippet below illustrates a head-segmented warp max reduction within a block of 32 threads (one warp).
+
+Suppose the set of input `thread_data` and `head_flag` across the block of threads is `{0, 1, 2, 3, ..., 31}` and `{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0}`, respectively. The corresponding output `aggregate` in threads 0, 4, 8, etc. will be `3`, `7`, `11`, etc. (and is undefined in other threads).
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpReduce for type int
+ using WarpReduce = cub::WarpReduce<int>;
+
+ // Allocate WarpReduce shared memory for one warp
+ __shared__ typename WarpReduce::TempStorage temp_storage;
+
+ // Obtain one input item and flag per thread
+ int thread_data = ...
+ int head_flag = ...
+
+ // Return the warp-wide reductions to each lane0
+ int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce(
+ thread_data, head_flag, cuda::maximum<>{});
+```
+
+### TailSegmentedReduce inline nodiscard
+
+Computes a segmented reduction in the calling warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes *lane*<sub>0</sub>).
+
+Supports non-commutative reduction operators
+
+
+```cpp showLineNumbers={false}
+template <typename ReductionOp, typename FlagT>
+T cub::WarpReduce::TailSegmentedReduce(
+ T input,
+ FlagT tail_flag,
+ ReductionOp reduction_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary reduction operator type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input
+
+
+
+Tail flag denoting whether or not `input` is the end of the current segment
+
+
+
+Reduction operator
+
+
+**Example**
+
+The code snippet below illustrates a tail-segmented warp max reduction within a block of 32 threads (one warp).
+
+Suppose the set of input `thread_data` and `tail_flag` across the block of threads is `{0, 1, 2, 3, ..., 31}` and `{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1}`, respectively. The corresponding output `aggregate` in threads 0, 4, 8, etc. will be `3`, `7`, `11`, etc. (and is undefined in other threads).
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpReduce for type int
+ using WarpReduce = cub::WarpReduce<int>;
+
+ // Allocate WarpReduce shared memory for one warp
+ __shared__ typename WarpReduce::TempStorage temp_storage;
+
+ // Obtain one input item and flag per thread
+ int thread_data = ...
+ int tail_flag = ...
+
+ // Return the warp-wide reductions to each lane0
+ int aggregate = WarpReduce(temp_storage).TailSegmentedReduce(
+ thread_data, tail_flag, cuda::maximum<>{});
+```
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `_TempStorage` | `typename InternalWarpReduce::TempStorage` | Shared memory storage layout type for `WarpReduce`. |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `is_full_warp` static constexpr | `bool` | |
+| `is_power_of_two` static constexpr | `bool` | |
+| `temp_storage` | `_TempStorage &` | Shared storage reference. |
+
+---
+
+## Inner classes
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::WarpReduce::TempStorage
+```
+
+
+The operations exposed by `WarpReduce` require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the `__shared__` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or `union`'d with other storage allocation types to facilitate memory reuse.
+
+**Inherits from:** `Uninitialized< _TempStorage >` (public)
diff --git a/fern/cudapages/cub/cub/cub/WarpScan.mdx b/fern/cudapages/cub/cub/cub/WarpScan.mdx
new file mode 100644
index 0000000..c02b2f8
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/WarpScan.mdx
@@ -0,0 +1,1184 @@
+---
+title: cub::WarpScan
+description: ""
+---
+
+The WarpScan class provides collective methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.
+
+
+
+## Performance considerations
+
+* Uses special instructions when applicable (e.g., warp `SHFL`)
+* Uses synchronization-free communication between warp lanes when applicable
+* Incurs zero bank conflicts for most types
+* Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+
+ * Summation (**vs.** generic scan)
+ * The architecture's warp size is a whole multiple of `LOGICAL_WARP_THREADS`
+
+## Example
+
+The code snippet below illustrates four concurrent warp prefix sums within a block of 128 threads (one per each of the 32-thread warps).
+
+Suppose the set of input `thread_data` across the block of threads is `{1, 1, 1, 1, ...}`. The corresponding output `thread_data` in each of the four warps of threads will be `{0, 1, 2, 3, ..., 31}`.
+
+The code snippet below illustrates a single warp prefix sum within a block of 128 threads.
+
+Suppose the set of input `thread_data` across the warp of threads is `{1, 1, 1, 1, ...}`. The corresponding output `thread_data` will be `{0, 1, 2, 3, ..., 31}`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpScan for type int
+ using WarpScan = cub::WarpScan<int>;
+
+ // Allocate WarpScan shared memory for 4 warps
+ __shared__ typename WarpScan::TempStorage temp_storage[4];
+
+ // Obtain one input item per thread
+ int thread_data = ...
+
+ // Compute warp-wide prefix sums
+ int warp_id = threadIdx.x / 32;
+ WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
+}
+
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpScan for type int
+ using WarpScan = cub::WarpScan<int>;
+
+ // Allocate WarpScan shared memory for one warp
+ __shared__ typename WarpScan::TempStorage temp_storage;
+ ...
+
+ // Only the first warp performs a prefix sum
+ if (threadIdx.x < 32)
+ {
+ // Obtain one input item per thread
+ int thread_data = ...
+
+ // Compute warp-wide prefix sums
+ WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+ }
+}
+```
+
+
+
+
+
+The scan input/output element type
+
+
+
+**[optional]** The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20).
+
+
+
+
+
+---
+
+## Collective constructors
+
+### WarpScan inline
+
+Collective constructor using the specified memory allocation as temporary storage.
+
+Logical warp and lane identifiers are constructed from `threadIdx.x`.
+
+
+```cpp showLineNumbers={false}
+cub::WarpScan::WarpScan(
+ TempStorage &temp_storage
+)
+```
+
+
+**Parameters**
+
+
+Reference to memory allocation having layout type [TempStorage](/library/api/cub::WarpScan::TempStorage)
+
+
+---
+
+## Inclusive prefix sums
+
+### InclusiveSum inline
+
+
+
+
+Computes an inclusive prefix sum across the calling warp.
+
+
+```cpp showLineNumbers={false}
+void cub::WarpScan::InclusiveSum(
+ T input,
+ T &inclusive_output
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Calling thread's input item.
+
+
+
+Calling thread's output item. May be aliased with `input`.
+
+
+**Example**
+
+The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of 128 threads (one per each of the 32-thread warps).
+
+Suppose the set of input `thread_data` across the block of threads is `{1, 1, 1, 1, ...}`. The corresponding output `thread_data` in each of the four warps of threads will be `{1, 2, 3, ..., 32}`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpScan for type int
+ using WarpScan = cub::WarpScan<int>;
+
+ // Allocate WarpScan shared memory for 4 warps
+ __shared__ typename WarpScan::TempStorage temp_storage[4];
+
+ // Obtain one input item per thread
+ int thread_data = ...
+
+ // Compute inclusive warp-wide prefix sums
+ int warp_id = threadIdx.x / 32;
+ WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data);
+}
+```
+
+
+
+
+Computes an inclusive prefix sum across the calling warp. Also provides every thread with the warp-wide `warp_aggregate` of all inputs.
+
+
+```cpp showLineNumbers={false}
+void cub::WarpScan::InclusiveSum(
+ T input,
+ T &inclusive_output,
+ T &warp_aggregate
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item. May be aliased with `input`
+
+
+
+Warp-wide aggregate reduction of input items
+
+
+**Example**
+
+The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of 128 threads (one per each of the 32-thread warps).
+
+Suppose the set of input `thread_data` across the block of threads is `{1, 1, 1, 1, ...}`. The corresponding output `thread_data` in each of the four warps of threads will be `{1, 2, 3, ..., 32}`. Furthermore, `warp_aggregate` for all threads in all warps will be `32`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpScan for type int
+ using WarpScan = cub::WarpScan<int>;
+
+ // Allocate WarpScan shared memory for 4 warps
+ __shared__ typename WarpScan::TempStorage temp_storage[4];
+
+ // Obtain one input item per thread
+ int thread_data = ...
+
+ // Compute inclusive warp-wide prefix sums
+ int warp_aggregate;
+ int warp_id = threadIdx.x / 32;
+ WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate);
+}
+```
+
+
+
+
+---
+
+## Exclusive prefix sums
+
+### ExclusiveSum inline
+
+
+
+
+Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to `exclusive_output` in *lane*<sub>0</sub>.
+
+
+```cpp showLineNumbers={false}
+void cub::WarpScan::ExclusiveSum(
+ T input,
+ T &exclusive_output
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* Uses the identity element (zero) as the initial value.
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Calling thread's input item.
+
+
+
+Calling thread's output item. May be aliased with `input`.
+
+
+**Example**
+
+The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of 128 threads (one per each of the 32-thread warps).
+
+Suppose the set of input `thread_data` across the block of threads is `{1, 1, 1, 1, ...}`. The corresponding output `thread_data` in each of the four warps of threads will be `{0, 1, 2, ..., 31}`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpScan for type int
+ using WarpScan = cub::WarpScan<int>;
+
+ // Allocate WarpScan shared memory for 4 warps
+ __shared__ typename WarpScan::TempStorage temp_storage[4];
+
+ // Obtain one input item per thread
+ int thread_data = ...
+
+ // Compute exclusive warp-wide prefix sums
+ int warp_id = threadIdx.x / 32;
+ WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
+}
+```
+
+
+
+
+Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to `exclusive_output` in *lane*<sub>0</sub>. Also provides every thread with the warp-wide `warp_aggregate` of all inputs.
+
+
+```cpp showLineNumbers={false}
+void cub::WarpScan::ExclusiveSum(
+ T input,
+ T &exclusive_output,
+ T &warp_aggregate
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* Uses the identity element (zero) as the initial value.
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item. May be aliased with `input`
+
+
+
+Warp-wide aggregate reduction of input items
+
+
+**Example**
+
+The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of 128 threads (one per each of the 32-thread warps).
+
+Suppose the set of input `thread_data` across the block of threads is `{1, 1, 1, 1, ...}`. The corresponding output `thread_data` in each of the four warps of threads will be `{0, 1, 2, ..., 31}`. Furthermore, `warp_aggregate` for all threads in all warps will be `32`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpScan for type int
+ using WarpScan = cub::WarpScan<int>;
+
+ // Allocate WarpScan shared memory for 4 warps
+ __shared__ typename WarpScan::TempStorage temp_storage[4];
+
+ // Obtain one input item per thread
+ int thread_data = ...
+
+ // Compute exclusive warp-wide prefix sums
+ int warp_aggregate;
+ int warp_id = threadIdx.x / 32;
+ WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data,
+ thread_data,
+ warp_aggregate);
+```
+
+
+
+
+---
+
+## Inclusive prefix scans
+
+### InclusiveScan inline
+
+
+
+
+Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.
+
+
+```cpp showLineNumbers={false}
+template <typename ScanOp>
+void cub::WarpScan::InclusiveScan(
+ T input,
+ T &inclusive_output,
+ ScanOp scan_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary scan operator type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item. May be aliased with `input`
+
+
+
+Binary scan operator
+
+
+**Example**
+
+The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of 128 threads (one per each of the 32-thread warps).
+
+Suppose the set of input `thread_data` across the block of threads is `{0, -1, 2, -3, ..., 126, -127}`. The corresponding output `thread_data` in the first warp would be `0, 0, 2, 2, ..., 30, 30`, the output for the second warp would be `32, 32, 34, 34, ..., 62, 62`, etc.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpScan for type int
+ using WarpScan = cub::WarpScan<int>;
+
+ // Allocate WarpScan shared memory for 4 warps
+ __shared__ typename WarpScan::TempStorage temp_storage[4];
+
+ // Obtain one input item per thread
+ int thread_data = ...
+
+ // Compute inclusive warp-wide prefix max scans
+ int warp_id = threadIdx.x / 32;
+ WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
+```
+
+
+
+
+Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.
+
+
+```cpp showLineNumbers={false}
+template <typename ScanOp>
+void cub::WarpScan::InclusiveScan(
+ T input,
+ T &inclusive_output,
+ T initial_value,
+ ScanOp scan_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary scan operator type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item. May be aliased with `input`
+
+
+
+Initial value to seed the inclusive scan (uniform across warp)
+
+
+
+Binary scan operator
+
+
+
+
+
+Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide `warp_aggregate` of all inputs.
+
+
+```cpp showLineNumbers={false}
+template <typename ScanOp>
+void cub::WarpScan::InclusiveScan(
+ T input,
+ T &inclusive_output,
+ ScanOp scan_op,
+ T &warp_aggregate
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary scan operator type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item. May be aliased with `input`
+
+
+
+Binary scan operator
+
+
+
+Warp-wide aggregate reduction of input items.
+
+
+**Example**
+
+The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of 128 threads (one per each of the 32-thread warps).
+
+Suppose the set of input `thread_data` across the block of threads is `{0, -1, 2, -3, ..., 126, -127}`. The corresponding output `thread_data` in the first warp would be `0, 0, 2, 2, ..., 30, 30`, the output for the second warp would be `32, 32, 34, 34, ..., 62, 62`, etc. Furthermore, `warp_aggregate` would be assigned `30` for threads in the first warp, `62` for threads in the second warp, etc.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpScan for type int
+ using WarpScan = cub::WarpScan<int>;
+
+ // Allocate WarpScan shared memory for 4 warps
+ __shared__ typename WarpScan::TempStorage temp_storage[4];
+
+ // Obtain one input item per thread
+ int thread_data = ...
+
+ // Compute inclusive warp-wide prefix max scans
+ int warp_aggregate;
+ int warp_id = threadIdx.x / 32;
+ WarpScan(temp_storage[warp_id]).InclusiveScan(
+ thread_data, thread_data, cuda::maximum<>{}, warp_aggregate);
+```
+
+
+
+
+Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide `warp_aggregate` of all inputs.
+
+
+```cpp showLineNumbers={false}
+template <typename ScanOp>
+void cub::WarpScan::InclusiveScan(
+ T input,
+ T &inclusive_output,
+ T initial_value,
+ ScanOp scan_op,
+ T &warp_aggregate
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary scan operator type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item. May be aliased with `input`
+
+
+
+Initial value to seed the inclusive scan (uniform across warp). It is not taken into account for warp_aggregate.
+
+
+
+Binary scan operator
+
+
+
+Warp-wide aggregate reduction of input items.
+
+
+
+
+
+---
+
+## Exclusive prefix scans
+
+### ExclusiveScan inline
+
+
+
+
+Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the `output` computed for *lane*<sub>0</sub> is undefined.
+
+
+```cpp showLineNumbers={false}
+template <typename ScanOp>
+void cub::WarpScan::ExclusiveScan(
+ T input,
+ T &exclusive_output,
+ ScanOp scan_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary scan operator type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item. May be aliased with `input`
+
+
+
+Binary scan operator
+
+
+**Example**
+
+The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of 128 threads (one per each of the 32-thread warps).
+
+Suppose the set of input `thread_data` across the block of threads is `{0, -1, 2, -3, ..., 126, -127}`. The corresponding output `thread_data` in the first warp would be `?, 0, 0, 2, ..., 28, 30`, the output for the second warp would be `?, 32, 32, 34, ..., 60, 62`, etc. (The output `thread_data` in warp *lane*<sub>0</sub> is undefined.)
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpScan for type int
+ using WarpScan = cub::WarpScan<int>;
+
+ // Allocate WarpScan shared memory for 4 warps
+ __shared__ typename WarpScan::TempStorage temp_storage[4];
+
+ // Obtain one input item per thread
+ int thread_data = ...
+
+ // Compute exclusive warp-wide prefix max scans
+ int warp_id = threadIdx.x / 32;
+ WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cuda::maximum<>{});
+```
+
+
+
+
+Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.
+
+
+```cpp showLineNumbers={false}
+template <typename ScanOp>
+void cub::WarpScan::ExclusiveScan(
+ T input,
+ T &exclusive_output,
+ T initial_value,
+ ScanOp scan_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary scan operator type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item. May be aliased with `input`
+
+
+
+Initial value to seed the exclusive scan
+
+
+
+Binary scan operator
+
+
+**Example**
+
+The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of 128 threads (one per each of the 32-thread warps).
+
+Suppose the set of input `thread_data` across the block of threads is `{0, -1, 2, -3, ..., 126, -127}`. The corresponding output `thread_data` in the first warp would be `INT_MIN, 0, 0, 2, ..., 28, 30`, the output for the second warp would be `INT_MIN, 32, 32, 34, ..., 60, 62`, etc.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpScan for type int
+ using WarpScan = cub::WarpScan<int>;
+
+ // Allocate WarpScan shared memory for 4 warps
+ __shared__ typename WarpScan::TempStorage temp_storage[4];
+
+ // Obtain one input item per thread
+ int thread_data = ...
+
+ // Compute exclusive warp-wide prefix max scans
+ int warp_id = threadIdx.x / 32;
+ WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data,
+ thread_data,
+ INT_MIN,
+ cuda::maximum<>{});
+```
+
+
+
+
+Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the `output` computed for *lane*<sub>0</sub> is undefined. Also provides every thread with the warp-wide `warp_aggregate` of all inputs.
+
+
+```cpp showLineNumbers={false}
+template <typename ScanOp>
+void cub::WarpScan::ExclusiveScan(
+ T input,
+ T &exclusive_output,
+ ScanOp scan_op,
+ T &warp_aggregate
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary scan operator type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item. May be aliased with `input`
+
+
+
+Binary scan operator
+
+
+
+Warp-wide aggregate reduction of input items
+
+
+**Example**
+
+The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of 128 threads (one per each of the 32-thread warps).
+
+Suppose the set of input `thread_data` across the block of threads is `{0, -1, 2, -3, ..., 126, -127}`. The corresponding output `thread_data` in the first warp would be `?, 0, 0, 2, ..., 28, 30`, the output for the second warp would be `?, 32, 32, 34, ..., 60, 62`, etc. (The output `thread_data` in warp *lane*<sub>0</sub> is undefined.) Furthermore, `warp_aggregate` would be assigned `30` for threads in the first warp, `62` for threads in the second warp, etc.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpScan for type int
+ using WarpScan = cub::WarpScan<int>;
+
+ // Allocate WarpScan shared memory for 4 warps
+ __shared__ typename WarpScan::TempStorage temp_storage[4];
+
+ // Obtain one input item per thread
+ int thread_data = ...
+
+ // Compute exclusive warp-wide prefix max scans
+ int warp_aggregate;
+ int warp_id = threadIdx.x / 32;
+ WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data,
+ thread_data,
+ cuda::maximum<>{},
+ warp_aggregate);
+```
+
+
+
+
+Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide `warp_aggregate` of all inputs.
+
+
+```cpp showLineNumbers={false}
+template <typename ScanOp>
+void cub::WarpScan::ExclusiveScan(
+ T input,
+ T &exclusive_output,
+ T initial_value,
+ ScanOp scan_op,
+ T &warp_aggregate
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary scan operator type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's output item. May be aliased with `input`
+
+
+
+Initial value to seed the exclusive scan
+
+
+
+Binary scan operator
+
+
+
+Warp-wide aggregate reduction of input items
+
+
+**Example**
+
+The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of 128 threads (one per each of the 32-thread warps).
+
+Suppose the set of input `thread_data` across the block of threads is `{0, -1, 2, -3, ..., 126, -127}`. The corresponding output `thread_data` in the first warp would be `INT_MIN, 0, 0, 2, ..., 28, 30`, the output for the second warp would be `INT_MIN, 32, 32, 34, ..., 60, 62`, etc. Furthermore, `warp_aggregate` would be assigned `30` for threads in the first warp, `62` for threads in the second warp, etc.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpScan for type int
+ using WarpScan = cub::WarpScan<int>;
+
+ // Allocate WarpScan shared memory for 4 warps
+ __shared__ typename WarpScan::TempStorage temp_storage[4];
+
+ // Obtain one input item per thread
+ int thread_data = ...
+
+ // Compute exclusive warp-wide prefix max scans
+ int warp_aggregate;
+ int warp_id = threadIdx.x / 32;
+ WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data,
+ thread_data,
+ INT_MIN,
+ cuda::maximum<>{},
+ warp_aggregate);
+```
+
+
+
+
+---
+
+## Combination (inclusive & exclusive) prefix scans
+
+### Scan inline
+
+
+
+
+Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. Because no initial value is supplied, the `exclusive_output` computed for *lane*<sub>0</sub> is undefined.
+
+
+```cpp showLineNumbers={false}
+template <typename ScanOp>
+void cub::WarpScan::Scan(
+ T input,
+ T &inclusive_output,
+ T &exclusive_output,
+ ScanOp scan_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary scan operator type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's inclusive-scan output item
+
+
+
+Calling thread's exclusive-scan output item
+
+
+
+Binary scan operator
+
+
+**Example**
+
+The code snippet below illustrates four concurrent warp-wide inclusive and exclusive prefix max scans within a block of 128 threads (one per each of the 32-thread warps).
+
+Suppose the set of input `thread_data` across the block of threads is `{0, -1, 2, -3, ..., 126, -127}`. The corresponding output `inclusive_partial` in the first warp would be `0, 0, 2, 2, ..., 30, 30`, the output for the second warp would be `32, 32, 34, 34, ..., 62, 62`, etc. The corresponding output `exclusive_partial` in the first warp would be `?, 0, 0, 2, ..., 28, 30`, the output for the second warp would be `?, 32, 32, 34, ..., 60, 62`, etc. (The output `exclusive_partial` in warp *lane*<sub>0</sub> is undefined.)
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpScan for type int
+ using WarpScan = cub::WarpScan<int>;
+
+ // Allocate WarpScan shared memory for 4 warps
+ __shared__ typename WarpScan::TempStorage temp_storage[4];
+
+ // Obtain one input item per thread
+ int thread_data = ...
+
+ // Compute exclusive warp-wide prefix max scans
+ int warp_id = threadIdx.x / 32;
+ int inclusive_partial, exclusive_partial;
+ WarpScan(temp_storage[warp_id]).Scan(thread_data,
+ inclusive_partial,
+ exclusive_partial,
+ cuda::maximum<>{});
+```
+
+
+
+
+Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp.
+
+
+```cpp showLineNumbers={false}
+template <typename ScanOp>
+void cub::WarpScan::Scan(
+ T input,
+ T &inclusive_output,
+ T &exclusive_output,
+ T initial_value,
+ ScanOp scan_op
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Template parameters**
+
+
+**[inferred]** Binary scan operator type having member `T operator()(const T &a, const T &b)`
+
+
+**Parameters**
+
+
+Calling thread's input item
+
+
+
+Calling thread's inclusive-scan output item
+
+
+
+Calling thread's exclusive-scan output item
+
+
+
+Initial value to seed the exclusive scan
+
+
+
+Binary scan operator
+
+
+**Example**
+
+The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of 128 threads (one per each of the 32-thread warps).
+
+Suppose the set of input `thread_data` across the block of threads is `{0, -1, 2, -3, ..., 126, -127}`. The corresponding output `inclusive_partial` in the first warp would be `0, 0, 2, 2, ..., 30, 30`, the output for the second warp would be `32, 32, 34, 34, ..., 62, 62`, etc. The corresponding output `exclusive_partial` in the first warp would be `INT_MIN, 0, 0, 2, ..., 28, 30`, the output for the second warp would be `INT_MIN, 32, 32, 34, ..., 60, 62`, etc.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpScan for type int
+ using WarpScan = cub::WarpScan<int>;
+
+ // Allocate WarpScan shared memory for 4 warps
+ __shared__ typename WarpScan::TempStorage temp_storage[4];
+
+ // Obtain one input item per thread
+ int thread_data = ...
+
+ // Compute inclusive and exclusive warp-wide prefix max scans
+ int warp_id = threadIdx.x / 32;
+ int inclusive_partial, exclusive_partial;
+ WarpScan(temp_storage[warp_id]).Scan(thread_data,
+ inclusive_partial,
+ exclusive_partial,
+ INT_MIN,
+ cuda::maximum<>{});
+```
+
+
+
+
+---
+
+## Data exchange
+
+### Broadcast inline
+
+Broadcast the value `input` from lane `src_lane` to all lanes in the warp.
+
+
+```cpp showLineNumbers={false}
+T cub::WarpScan::Broadcast(
+ T input,
+ unsigned int src_lane
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+* The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+The value to broadcast
+
+
+
+Which warp lane is to do the broadcasting
+
+
+**Example**
+
+The code snippet below illustrates the warp-wide broadcasts of values from lane 0 in each of four warps to all other threads in those warps.
+
+Suppose the set of input `thread_data` across the block of threads is `{0, 1, 2, 3, ..., 127}`. The corresponding output `thread_data` will be `{0, 0, ..., 0}` in warp 0, `{32, 32, ..., 32}` in warp 1, `{64, 64, ..., 64}` in warp 2, etc.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh>
+
+__global__ void ExampleKernel(...)
+{
+ // Specialize WarpScan for type int
+ using WarpScan = cub::WarpScan<int>;
+
+ // Allocate WarpScan shared memory for 4 warps
+ __shared__ typename WarpScan::TempStorage temp_storage[4];
+
+ // Obtain one input item per thread
+ int thread_data = ...
+
+ // Broadcast from lane0 in each warp to all other threads in the warp
+ int warp_id = threadIdx.x / 32;
+ thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0);
+```
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `InternalWarpScan` | `::cuda::std:: _If< IS_POW_OF_TWO, detail::WarpScanShfl< T, LOGICAL_WARP_THREADS >, detail::WarpScanSmem< T, LOGICAL_WARP_THREADS > >` | Internal specialization. |
+| `_TempStorage` | `typename InternalWarpScan::TempStorage` | Shared memory storage layout type for `WarpScan`. |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `IS_ARCH_WARP` static constexpr | `bool` | Whether the logical warp size and the PTX warp size coincide. |
+| `IS_POW_OF_TWO` static constexpr | `bool` | Whether the logical warp size is a power-of-two. |
+| `IS_INTEGER` static constexpr | `bool` | Whether the data type is an integer (which has fully-associative addition). |
+| `temp_storage` | `_TempStorage &` | Shared storage reference. |
+| `lane_id` | `unsigned int` | |
+
+---
+
+## Inner classes
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::WarpScan::TempStorage
+```
+
+
+The operations exposed by `WarpScan` require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the `__shared__` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or `union`'d with other storage allocation types to facilitate memory reuse.
+
+**Inherits from:** `Uninitialized< _TempStorage >` (public)
diff --git a/fern/cudapages/cub/cub/cub/WarpStore.mdx b/fern/cudapages/cub/cub/cub/WarpStore.mdx
new file mode 100644
index 0000000..0fc6fcf
--- /dev/null
+++ b/fern/cudapages/cub/cub/cub/WarpStore.mdx
@@ -0,0 +1,353 @@
+---
+title: cub::WarpStore
+description: ""
+---
+
+The WarpStore class provides collective data movement methods for writing a blocked arrangement of items partitioned across a CUDA warp to a linear segment of memory.
+
+## Example
+
+The code snippet below illustrates the storing of a "blocked" arrangement of 64 integers across 16 threads (where each thread owns 4 consecutive items) into a linear segment of memory. The store is specialized for `WARP_STORE_TRANSPOSE`, meaning items are locally reordered among threads so that memory references will be efficiently coalesced using a warp-striped access pattern.
+
+Suppose the set of `thread_data` across the warp threads is `{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }`. The output `d_data` will be `0, 1, 2, 3, 4, 5, ...`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/warp/warp_store.cuh>
+
+__global__ void ExampleKernel(int *d_data, ...)
+{
+ constexpr int warp_threads = 16;
+ constexpr int block_threads = 256;
+ constexpr int items_per_thread = 4;
+
+ // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
+ using WarpStoreT = cub::WarpStore<int, items_per_thread, cub::WARP_STORE_TRANSPOSE, warp_threads>;
+
+ constexpr int warps_in_block = block_threads / warp_threads;
+ constexpr int tile_size = items_per_thread * warp_threads;
+ const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
+
+ // Allocate shared memory for WarpStore
+ __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Store items to linear memory
+ WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
+}
+```
+
+
+
+
+
+The type of data to be written.
+
+
+
+The number of consecutive items partitioned onto each thread.
+
+
+
+**[optional]** [cub::WarpStoreAlgorithm](/library/api/cub::WarpStoreAlgorithm) tuning policy enumeration. default: [cub::WARP_STORE_DIRECT](/library/api/cub::WARP_STORE_DIRECT).
+
+
+
+**[optional]** The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a power of two.
+
+
+
+
+
+---
+
+## Collective constructors
+
+### WarpStore inline
+
+
+
+
+Collective constructor using a private static allocation of shared memory as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::WarpStore::WarpStore()
+```
+
+
+
+
+
+Collective constructor using the specified memory allocation as temporary storage.
+
+
+```cpp showLineNumbers={false}
+cub::WarpStore::WarpStore(
+ TempStorage &temp_storage
+)
+```
+
+
+
+
+
+---
+
+## Data movement
+
+### Store inline
+
+
+
+
+Store items into a linear segment of memory.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::WarpStore::Store(
+ OutputIteratorT block_itr,
+ T (&items)[ITEMS_PER_THREAD]
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+The thread block's base output iterator for storing to
+
+
+
+Data to store
+
+
+**Example**
+
+The code snippet below illustrates the storing of a "blocked" arrangement of 64 integers across 16 threads (where each thread owns 4 consecutive items) into a linear segment of memory. The store is specialized for `WARP_STORE_TRANSPOSE`, meaning items are locally reordered among threads so that memory references will be efficiently coalesced using a warp-striped access pattern.
+
+Suppose the set of `thread_data` across the warp threads is `{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }`. The output `d_data` will be `0, 1, 2, 3, 4, 5, ...`.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/warp/warp_store.cuh>
+
+__global__ void ExampleKernel(int *d_data, ...)
+{
+ constexpr int warp_threads = 16;
+ constexpr int block_threads = 256;
+ constexpr int items_per_thread = 4;
+
+ // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
+ using WarpStoreT = cub::WarpStore<int, items_per_thread, cub::WARP_STORE_TRANSPOSE, warp_threads>;
+
+ constexpr int warps_in_block = block_threads / warp_threads;
+ constexpr int tile_size = items_per_thread * warp_threads;
+ const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
+
+ // Allocate shared memory for WarpStore
+ __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Store items to linear memory
+ WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
+```
+
+
+
+
+Store items into a linear segment of memory, guarded by range.
+
+
+```cpp showLineNumbers={false}
+template
+void cub::WarpStore::Store(
+ OutputIteratorT block_itr,
+ T (&items)[ITEMS_PER_THREAD],
+ int valid_items
+)
+```
+
+
+*Added in v2.2.0. First appears in CUDA Toolkit 12.3.*
+
+
+The block-wide aggregate of `temp_storage` is undefined after calling this method and should not be used. To preserve the aggregate, use a separate `TempStorage` for each method call.
+
+
+**Parameters**
+
+
+The thread block's base output iterator for storing to
+
+
+
+Data to store
+
+
+
+Number of valid items to write
+
+
+**Example**
+
+The code snippet below illustrates the storing of a "blocked" arrangement of 64 integers across 16 threads (where each thread owns 4 consecutive items) into a linear segment of memory. The store is specialized for `WARP_STORE_TRANSPOSE`, meaning items are locally reordered among threads so that memory references will be efficiently coalesced using a warp-striped access pattern.
+
+Suppose the set of `thread_data` across the warp threads is `{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }` and `valid_items` is `5`. The output `d_data` will be `0, 1, 2, 3, 4, ?, ?, ...`, with only the first two threads being unmasked to store portions of valid data.
+
+```cpp showLineNumbers={false}
+#include <cub/cub.cuh> // or equivalently <cub/warp/warp_store.cuh>
+
+__global__ void ExampleKernel(int *d_data, int valid_items, ...)
+{
+ constexpr int warp_threads = 16;
+ constexpr int block_threads = 256;
+ constexpr int items_per_thread = 4;
+
+ // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
+ using WarpStoreT = cub::WarpStore<int, items_per_thread, cub::WARP_STORE_TRANSPOSE, warp_threads>;
+
+ constexpr int warps_in_block = block_threads / warp_threads;
+ constexpr int tile_size = items_per_thread * warp_threads;
+ const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
+
+ // Allocate shared memory for WarpStore
+ __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
+
+ // Obtain a segment of consecutive items that are blocked across threads
+ int thread_data[4];
+ ...
+
+ // Store items to linear memory
+ WarpStoreT(temp_storage[warp_id]).Store(
+ d_data + warp_id * tile_size, thread_data, valid_items);
+```
+
+
+
+
+---
+
+## Utility methods
+
+### PrivateStorage inline
+
+
+```cpp showLineNumbers={false}
+_TempStorage & cub::WarpStore::PrivateStorage()
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `InternalStore` | `StoreInternal< ALGORITHM, 0 >` | Internal store implementation to use. |
+| `_TempStorage` | `typename InternalStore::TempStorage` | Shared memory storage layout type. |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `IS_ARCH_WARP` static constexpr | `bool` | |
+| `temp_storage` | `_TempStorage &` | |
+| `linear_tid` | `int` | |
+
+---
+
+## Inner classes
+
+### StoreInternal
+
+
+```cpp showLineNumbers={false}
+struct cub::WarpStore::StoreInternal
+```
+
+
+Store helper.
+
+### StoreInternal< WARP_STORE_DIRECT, DUMMY >
+
+
+```cpp showLineNumbers={false}
+struct cub::WarpStore::StoreInternal< WARP_STORE_DIRECT, DUMMY >
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `linear_tid` | `int` | |
+
+### StoreInternal< WARP_STORE_STRIPED, DUMMY >
+
+
+```cpp showLineNumbers={false}
+struct cub::WarpStore::StoreInternal< WARP_STORE_STRIPED, DUMMY >
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `linear_tid` | `int` | |
+
+### StoreInternal< WARP_STORE_VECTORIZE, DUMMY >
+
+
+```cpp showLineNumbers={false}
+struct cub::WarpStore::StoreInternal< WARP_STORE_VECTORIZE, DUMMY >
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `linear_tid` | `int` | |
+
+### StoreInternal< WARP_STORE_TRANSPOSE, DUMMY >
+
+
+```cpp showLineNumbers={false}
+struct cub::WarpStore::StoreInternal< WARP_STORE_TRANSPOSE, DUMMY >
+```
+
+
+| Name | Type | Description |
+|---|---|---|
+| `temp_storage` | `_TempStorage &` | |
+| `linear_tid` | `int` | |
+
+### TempStorage
+
+
+```cpp showLineNumbers={false}
+struct cub::WarpStore::TempStorage
+```
+
+
+**Inherits from:** `Uninitialized< _TempStorage >` (public)
diff --git a/fern/cudapages/cuda/cuda/cuda/arch_traits_t.mdx b/fern/cudapages/cuda/cuda/cuda/arch_traits_t.mdx
new file mode 100644
index 0000000..d1af5f3
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/arch_traits_t.mdx
@@ -0,0 +1,52 @@
+---
+title: "cuda::arch_traits_t"
+description: "Architecture traits. This type contains information about an architecture that is constant across devices of that architecture."
+---
+
+Architecture traits. This type contains information about an architecture that is constant across devices of that architecture.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `max_threads_per_block` | `int` | |
+| `max_block_dim_x` | `int` | |
+| `max_block_dim_y` | `int` | |
+| `max_block_dim_z` | `int` | |
+| `max_grid_dim_x` | `int` | |
+| `max_grid_dim_y` | `int` | |
+| `max_grid_dim_z` | `int` | |
+| `max_shared_memory_per_block` | `::cuda::std::size_t` | |
+| `total_constant_memory` | `::cuda::std::size_t` | |
+| `warp_size` | `int` | |
+| `max_resident_grids` | `int` | |
+| `gpu_overlap` | `bool` | |
+| `can_map_host_memory` | `bool` | |
+| `concurrent_kernels` | `bool` | |
+| `stream_priorities_supported` | `bool` | |
+| `global_l1_cache_supported` | `bool` | |
+| `local_l1_cache_supported` | `bool` | |
+| `max_registers_per_block` | `int` | |
+| `max_registers_per_multiprocessor` | `int` | |
+| `max_registers_per_thread` | `int` | |
+| `arch_id` | `::cuda::arch_id` | |
+| `compute_capability_major` | `int` | |
+| `compute_capability_minor` | `int` | |
+| `compute_capability` | `::cuda::compute_capability` | |
+| `max_shared_memory_per_multiprocessor` | `::cuda::std::size_t` | |
+| `max_blocks_per_multiprocessor` | `int` | |
+| `max_threads_per_multiprocessor` | `int` | |
+| `max_warps_per_multiprocessor` | `int` | |
+| `reserved_shared_memory_per_block` | `::cuda::std::size_t` | |
+| `max_shared_memory_per_block_optin` | `::cuda::std::size_t` | |
+| `cluster_supported` | `bool` | |
+| `redux_intrinisic` | `bool` | |
+| `elect_intrinsic` | `bool` | |
+| `cp_async_supported` | `bool` | |
+| `tma_supported` | `bool` | |
diff --git a/fern/cudapages/cuda/cuda/cuda/buffer.mdx b/fern/cudapages/cuda/cuda/cuda/buffer.mdx
new file mode 100644
index 0000000..a967d86
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/buffer.mdx
@@ -0,0 +1,723 @@
+---
+title: "cuda::buffer"
+description: ""
+---
+
+`buffer` is a container that provides resizable typed storage allocated from a given memory resource. It handles alignment, release and growth of the allocation. The elements are initialized during construction, which may require a kernel launch.
+
+In addition to being type-safe, `buffer` also takes a set of properties to ensure that e.g. execution space constraints are checked at compile time. However, only stateless properties can be forwarded. To use a stateful property, implement `get_property(const buffer&, Property)`.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+
+
+
+
+The type to be stored in the buffer
+
+
+
+The properties the allocated memory satisfies
+
+
+
+
+
+---
+
+## Constructors
+
+### buffer inline
+
+
+
+
+explicit
+
+Copy-constructs from a buffer.
+
+
+```cpp showLineNumbers={false}
+cuda::buffer<_Tp, _Properties>::buffer(
+ const buffer &__other
+)
+```
+
+
+**Parameters**
+
+
+The other buffer.
+
+
+
+
+
+noexcept
+
+Move-constructs from a buffer.
+
+
+```cpp showLineNumbers={false}
+cuda::buffer<_Tp, _Properties>::buffer(
+ buffer &&__other
+) noexcept
+```
+
+
+**Parameters**
+
+
+The other buffer. After move construction, the other buffer can only be assigned to or destroyed.
+
+
+
+
+
+explicit
+
+Copy-constructs from a buffer with matching properties.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::buffer<_Tp, _Properties>::buffer(
+ const buffer<_Tp, _OtherProperties...> &__other
+)
+```
+
+
+**Parameters**
+
+
+The other buffer.
+
+
+
+
+
+noexcept
+
+Move-constructs from a buffer with matching properties.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::buffer<_Tp, _Properties>::buffer(
+ buffer<_Tp, _OtherProperties...> &&__other
+) noexcept
+```
+
+
+**Parameters**
+
+
+The other buffer. After move construction, the other buffer can only be assigned to or destroyed.
+
+
+
+
+
+Constructs an empty buffer using an environment.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::buffer<_Tp, _Properties>::buffer(
+ ::cuda::stream_ref __stream,
+ _Resource &&__resource,
+ const _Env &__env = {}
+)
+```
+
+
+
+No memory is allocated.
+
+
+**Parameters**
+
+
+The environment providing the needed information
+
+
+
+
+
+explicit
+
+Constructs a buffer of size `__size` using a memory resource and leaves all elements uninitialized.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::buffer<_Tp, _Properties>::buffer(
+ ::cuda::stream_ref __stream,
+ _Resource &&__resource,
+ const size_type __size,
+ ::cuda::no_init_t,
+ const _Env &__env = {}
+)
+```
+
+
+
+This constructor does *NOT* initialize any elements. It is the user's responsibility to ensure that the elements within `[vec.begin(), vec.end())` are properly initialized, e.g. with `cuda::std::uninitialized_copy`. At the destruction of the `buffer` all elements in the range `[vec.begin(), vec.end())` will be destroyed.
+
+
+**Parameters**
+
+
+The size of the buffer.
+
+
+
+The environment used to query the memory resource.
+
+
+
+
+
+Constructs a buffer using a memory resource and copy-constructs all elements from the forward range `[__first, __last)`.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::buffer<_Tp, _Properties>::buffer(
+ ::cuda::stream_ref __stream,
+ _Resource &&__resource,
+ _Iter __first,
+ _Iter __last,
+ const _Env &__env = {}
+)
+```
+
+
+
+If `__first == __last` then no memory is allocated
+
+
+**Parameters**
+
+
+The start of the input sequence.
+
+
+
+The end of the input sequence.
+
+
+
+The environment used to query the memory resource.
+
+
+
+
+
+Constructs a buffer using a memory resource and copy-constructs all elements from `__ilist`.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::buffer<_Tp, _Properties>::buffer(
+ ::cuda::stream_ref __stream,
+ _Resource &&__resource,
+ ::cuda::std::initializer_list<_Tp> __ilist,
+ const _Env &__env = {}
+)
+```
+
+
+
+If `__ilist.size() == 0` then no memory is allocated
+
+
+**Parameters**
+
+
+The initializer_list being copied into the buffer.
+
+
+
+The environment used to query the memory resource.
+
+
+
+
+
+Constructs a buffer using a memory resource and an input range.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::buffer<_Tp, _Properties>::buffer(
+ ::cuda::stream_ref __stream,
+ _Resource &&__resource,
+ _Range &&__range,
+ const _Env &__env = {}
+)
+```
+
+
+
+If `__range.size() == 0` then no memory is allocated.
+
+
+**Parameters**
+
+
+The input range to be moved into the buffer.
+
+
+
+The environment used to query the memory resource.
+
+
+
+
+
+---
+
+## Assignment operators
+
+### operator= inline
+
+Move assignment operator.
+
+
+```cpp showLineNumbers={false}
+void cuda::buffer<_Tp, _Properties>::operator=(
+ buffer &&__other
+)
+```
+
+
+**Parameters**
+
+
+The other buffer. After move assignment, the other buffer can only be assigned to or destroyed.
+
+
+---
+
+## Methods
+
+### begin inline noexcept nodiscard
+
+
+
+
+Returns an iterator to the first element of the buffer.
+
+If the buffer is empty, the returned iterator will be equal to [end()](/libcudacxx/api/cuda::buffer::end()).
+
+
+```cpp showLineNumbers={false}
+iterator cuda::buffer<_Tp, _Properties>::begin() noexcept
+```
+
+
+
+
+
+const
+
+Returns an immutable iterator to the first element of the buffer.
+
+If the buffer is empty, the returned iterator will be equal to [end()](/libcudacxx/api/cuda::buffer::end()).
+
+
+```cpp showLineNumbers={false}
+const_iterator cuda::buffer<_Tp, _Properties>::begin() const noexcept
+```
+
+
+
+
+
+### cbegin inline const noexcept nodiscard
+
+Returns an immutable iterator to the first element of the buffer.
+
+If the buffer is empty, the returned iterator will be equal to [end()](/libcudacxx/api/cuda::buffer::end()).
+
+
+```cpp showLineNumbers={false}
+const_iterator cuda::buffer<_Tp, _Properties>::cbegin() const noexcept
+```
+
+
+### end inline noexcept nodiscard
+
+
+
+
+Returns an iterator to the element following the last element of the buffer.
+
+This element acts as a placeholder; attempting to access it results in undefined behavior.
+
+
+```cpp showLineNumbers={false}
+iterator cuda::buffer<_Tp, _Properties>::end() noexcept
+```
+
+
+
+
+
+const
+
+Returns an immutable iterator to the element following the last element of the buffer.
+
+This element acts as a placeholder; attempting to access it results in undefined behavior.
+
+
+```cpp showLineNumbers={false}
+const_iterator cuda::buffer<_Tp, _Properties>::end() const noexcept
+```
+
+
+
+
+
+### cend inline const noexcept nodiscard
+
+Returns an immutable iterator to the element following the last element of the buffer.
+
+This element acts as a placeholder; attempting to access it results in undefined behavior.
+
+
+```cpp showLineNumbers={false}
+const_iterator cuda::buffer<_Tp, _Properties>::cend() const noexcept
+```
+
+
+### rbegin inline noexcept nodiscard
+
+
+
+
+Returns a reverse iterator to the first element of the reversed buffer.
+
+It corresponds to the last element of the non-reversed buffer. If the buffer is empty, the returned iterator is equal to [rend()](/libcudacxx/api/cuda::buffer::rend()).
+
+
+```cpp showLineNumbers={false}
+reverse_iterator cuda::buffer<_Tp, _Properties>::rbegin() noexcept
+```
+
+
+
+
+
+const
+
+Returns an immutable reverse iterator to the first element of the reversed buffer.
+
+It corresponds to the last element of the non-reversed buffer. If the buffer is empty, the returned iterator is equal to [rend()](/libcudacxx/api/cuda::buffer::rend()).
+
+
+```cpp showLineNumbers={false}
+const_reverse_iterator cuda::buffer<_Tp, _Properties>::rbegin() const noexcept
+```
+
+
+
+
+
+### crbegin inline const noexcept nodiscard
+
+Returns an immutable reverse iterator to the first element of the reversed buffer.
+
+It corresponds to the last element of the non-reversed buffer. If the buffer is empty, the returned iterator is equal to [rend()](/libcudacxx/api/cuda::buffer::rend()).
+
+
+```cpp showLineNumbers={false}
+const_reverse_iterator cuda::buffer<_Tp, _Properties>::crbegin() const noexcept
+```
+
+
+### rend inline noexcept nodiscard
+
+
+
+
+Returns a reverse iterator to the element following the last element of the reversed buffer.
+
+It corresponds to the element preceding the first element of the non-reversed buffer. This element acts as a placeholder; attempting to access it results in undefined behavior.
+
+
+```cpp showLineNumbers={false}
+reverse_iterator cuda::buffer<_Tp, _Properties>::rend() noexcept
+```
+
+
+
+
+
+const
+
+Returns an immutable reverse iterator to the element following the last element of the reversed buffer.
+
+It corresponds to the element preceding the first element of the non-reversed buffer. This element acts as a placeholder; attempting to access it results in undefined behavior.
+
+
+```cpp showLineNumbers={false}
+const_reverse_iterator cuda::buffer<_Tp, _Properties>::rend() const noexcept
+```
+
+
+
+
+
+### crend inline const noexcept nodiscard
+
+Returns an immutable reverse iterator to the element following the last element of the reversed buffer.
+
+It corresponds to the element preceding the first element of the non-reversed buffer. This element acts as a placeholder; attempting to access it results in undefined behavior.
+
+
+```cpp showLineNumbers={false}
+const_reverse_iterator cuda::buffer<_Tp, _Properties>::crend() const noexcept
+```
+
+
+### data inline noexcept nodiscard
+
+
+
+
+Returns a pointer to the first element of the buffer.
+
+If the buffer has not allocated memory the pointer will be null.
+
+
+```cpp showLineNumbers={false}
+pointer cuda::buffer<_Tp, _Properties>::data() noexcept
+```
+
+
+
+
+
+const
+
+Returns a pointer to the first element of the buffer.
+
+If the buffer has not allocated memory the pointer will be null.
+
+
+```cpp showLineNumbers={false}
+const_pointer cuda::buffer<_Tp, _Properties>::data() const noexcept
+```
+
+
+
+
+
+### get_unsynchronized inline noexcept nodiscard
+
+
+
+
+Returns a reference to the `__n`'th element of the buffer.
+
+
+```cpp showLineNumbers={false}
+reference cuda::buffer<_Tp, _Properties>::get_unsynchronized(
+ const size_type __n
+) noexcept
+```
+
+
+
+Does not synchronize with the stored stream
+
+
+**Parameters**
+
+
+The index of the element we want to access
+
+
+
+
+
+const
+
+Returns a reference to the `__n`'th element of the buffer.
+
+
+```cpp showLineNumbers={false}
+const_reference cuda::buffer<_Tp, _Properties>::get_unsynchronized(
+ const size_type __n
+) const noexcept
+```
+
+
+
+Does not synchronize with the stored stream
+
+
+**Parameters**
+
+
+The index of the element we want to access
+
+
+
+
+
+### size inline const noexcept nodiscard
+
+Returns the current number of elements stored in the buffer.
+
+
+```cpp showLineNumbers={false}
+size_type cuda::buffer<_Tp, _Properties>::size() const noexcept
+```
+
+
+### empty inline const noexcept nodiscard
+
+Returns true if the buffer is empty.
+
+
+```cpp showLineNumbers={false}
+bool cuda::buffer<_Tp, _Properties>::empty() const noexcept
+```
+
+
+### memory_resource inline const noexcept nodiscard
+
+Returns a `const` reference to the any_resource that holds the memory resource used to allocate the buffer.
+
+
+```cpp showLineNumbers={false}
+const __resource_t & cuda::buffer<_Tp, _Properties>::memory_resource() const noexcept
+```
+
+
+### stream inline constexpr const noexcept nodiscard
+
+Returns the stored stream.
+
+
+```cpp showLineNumbers={false}
+stream_ref cuda::buffer<_Tp, _Properties>::stream() const noexcept
+```
+
+
+
+Stream used to allocate the buffer is initially stored in the buffer, but can be changed with [`set_stream`](/libcudacxx/api/cuda::buffer::set_stream)
+
+
+### set_stream inline constexpr
+
+Replaces the stored stream.
+
+
+```cpp showLineNumbers={false}
+void cuda::buffer<_Tp, _Properties>::set_stream(
+ stream_ref __new_stream
+)
+```
+
+
+
+Always synchronizes with the old stream
+
+
+**Parameters**
+
+
+The new stream
+
+
+### swap inline noexcept
+
+Swaps the contents of a buffer with those of `__other`.
+
+
+```cpp showLineNumbers={false}
+void cuda::buffer<_Tp, _Properties>::swap(
+ buffer &__other
+) noexcept
+```
+
+
+**Parameters**
+
+
+The other buffer.
+
+
+### destroy inline
+
+
+
+
+Destroys the buffer, deallocates the buffer and destroys the memory resource.
+
+
+```cpp showLineNumbers={false}
+void cuda::buffer<_Tp, _Properties>::destroy(
+ ::cuda::stream_ref __stream
+)
+```
+
+
+
+After this explicit destroy call, the buffer can only be assigned to or destroyed.
+
+
+**Parameters**
+
+
+The stream to deallocate the buffer on.
+
+
+
+
+
+Destroys the buffer, deallocates the buffer and destroys the memory resource.
+
+
+```cpp showLineNumbers={false}
+void cuda::buffer<_Tp, _Properties>::destroy()
+```
+
+
+
+Uses the stored stream to deallocate the buffer, equivalent to calling `buffer.destroy(buffer.stream())`.
+
+
+
+After this explicit destroy call, the buffer can only be assigned to or destroyed.
+
+
+
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `value_type` | `_Tp` |
+| `reference` | `_Tp &` |
+| `const_reference` | `const _Tp &` |
+| `pointer` | `_Tp *` |
+| `const_pointer` | `const _Tp *` |
+| `iterator` | `::cuda::heterogeneous_iterator< _Tp, _Properties... >` |
+| `const_iterator` | `::cuda::heterogeneous_iterator< const _Tp, _Properties... >` |
+| `reverse_iterator` | `::cuda::std::reverse_iterator< iterator >` |
+| `const_reverse_iterator` | `::cuda::std::reverse_iterator< const_iterator >` |
+| `size_type` | `::cuda::std::size_t` |
+| `difference_type` | `::cuda::std::ptrdiff_t` |
+| `properties_list` | `::cuda::mr::properties_list< _Properties... >` |
diff --git a/fern/cudapages/cuda/cuda/cuda/compute_capability.mdx b/fern/cudapages/cuda/cuda/cuda/compute_capability.mdx
new file mode 100644
index 0000000..0d5a30f
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/compute_capability.mdx
@@ -0,0 +1,207 @@
+---
+title: "cuda::compute_capability"
+description: "Type representing the CUDA compute capability."
+---
+
+Type representing the CUDA compute capability.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+---
+
+## Constructors
+
+### compute_capability constexpr noexcept
+
+
+
+
+
+```cpp showLineNumbers={false}
+cuda::compute_capability::compute_capability() noexcept = default
+```
+
+
+
+
+
+inline explicit
+
+Constructs the object from compute capability `__cc`.
+
+The expected format is 10 * major + minor.
+
+
+```cpp showLineNumbers={false}
+cuda::compute_capability::compute_capability(
+ int __cc
+) noexcept
+```
+
+
+**Parameters**
+
+
+Compute capability.
+
+
+
+
+
+inline
+
+Constructs the object by combining the `__major` and `__minor` compute capability.
+
+
+```cpp showLineNumbers={false}
+cuda::compute_capability::compute_capability(
+ int __major,
+ int __minor
+) noexcept
+```
+
+
+**Parameters**
+
+
+The major compute capability.
+
+
+
+The minor compute capability. Must be less than 10.
+
+
+
+
+
+inline explicit
+
+Constructs the object from the architecture id.
+
+
+```cpp showLineNumbers={false}
+cuda::compute_capability::compute_capability(
+ arch_id __arch_id
+) noexcept
+```
+
+
+**Parameters**
+
+
+The architecture id.
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+cuda::compute_capability::compute_capability(
+ const compute_capability &
+) noexcept = default
+```
+
+
+
+
+
+---
+
+## Assignment operators
+
+### operator= constexpr noexcept
+
+
+```cpp showLineNumbers={false}
+compute_capability & cuda::compute_capability::operator=(
+ const compute_capability &__other
+) noexcept = default
+```
+
+
+---
+
+## Methods
+
+### get inline constexpr const noexcept nodiscard
+
+Gets the stored compute capability.
+
+
+```cpp showLineNumbers={false}
+int cuda::compute_capability::get() const noexcept
+```
+
+
+**Returns:** The stored compute capability in format 10 * major + minor.
+
+### major inline constexpr const noexcept nodiscard
+
+Gets the major compute capability.
+
+
+```cpp showLineNumbers={false}
+int cuda::compute_capability::major() const noexcept
+```
+
+
+
+This symbol is deprecated because it collides with the `major(...)` macro defined in `<sys/sysmacros.h>` and will be removed in the next major release. Use `cc.major_cap()` instead.
+
+
+**Returns:** Major compute capability.
+
+### major_cap inline constexpr const noexcept nodiscard
+
+Gets the major compute capability.
+
+
+```cpp showLineNumbers={false}
+int cuda::compute_capability::major_cap() const noexcept
+```
+
+
+**Returns:** Major compute capability.
+
+### minor inline constexpr const noexcept nodiscard
+
+Gets the minor compute capability.
+
+
+```cpp showLineNumbers={false}
+int cuda::compute_capability::minor() const noexcept
+```
+
+
+
+This symbol is deprecated because it collides with the `minor(...)` macro defined in `<sys/sysmacros.h>` and will be removed in the next major release. Use `cc.minor_cap()` instead.
+
+
+**Returns:** Minor compute capability. The value is always less than 10.
+
+### minor_cap inline constexpr const noexcept nodiscard
+
+Gets the minor compute capability.
+
+
+```cpp showLineNumbers={false}
+int cuda::compute_capability::minor_cap() const noexcept
+```
+
+
+**Returns:** Minor compute capability. The value is always less than 10.
+
+### operator int inline constexpr explicit const noexcept
+
+Conversion operator to `int`.
+
+
+```cpp showLineNumbers={false}
+cuda::compute_capability::operator int() const noexcept
+```
+
+
+**Returns:** The stored compute capability in format 10 * major + minor.
diff --git a/fern/cudapages/cuda/cuda/cuda/constant_iterator.mdx b/fern/cudapages/cuda/cuda/cuda/constant_iterator.mdx
new file mode 100644
index 0000000..608063b
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/constant_iterator.mdx
@@ -0,0 +1,255 @@
+---
+title: "cuda::constant_iterator"
+description: "The [`constant_iterator`](/libcudacxx/api/cuda::constant_iterator) class represents an iterator in an infinite sequence of repeated values."
+---
+
+The `constant_iterator` class represents an iterator in an infinite sequence of repeated values.
+
+This iterator is useful for creating a range filled with the same value without explicitly storing it in memory. Using `constant_iterator` saves both memory capacity and bandwidth.
+
+The following code snippet demonstrates how to create a `constant_iterator` whose [`value_type`](/libcudacxx/api/cuda::constant_iterator::value_type) is `int` and whose value is `10`.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+## Example
+
+```cpp showLineNumbers={false}
+#include <cuda/iterator>
+
+cuda::constant_iterator<int> iter(10);
+
+*iter; // returns 10
+iter[0]; // returns 10
+iter[1]; // returns 10
+iter[13]; // returns 10
+
+// and so on...
+```
+
+
+
+
+
+The value type of the `constant_iterator`.
+
+
+
+The index type of the `constant_iterator`. It can optionally be specified, but must satisfy **integer-like**
+
+
+
+
+
+---
+
+## Constructors
+
+### constant_iterator inline constexpr noexcept
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+cuda::constant_iterator<_Tp, _Index>::constant_iterator() noexcept(::cuda::std::is_nothrow_default_constructible_v< _Tp2 >)
+```
+
+
+
+
+
+Creates a `constant_iterator` from a value.
+
+The index is set to zero
+
+
+```cpp showLineNumbers={false}
+cuda::constant_iterator<_Tp, _Index>::constant_iterator(
+ _Tp __value
+) noexcept(::cuda::std::is_nothrow_move_constructible_v< _Tp >)
+```
+
+
+**Parameters**
+
+
+The value to store in the `constant_iterator`
+
+
+
+
+
+explicit
+
+Creates `constant_iterator` from a value and an index.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::constant_iterator<_Tp, _Index>::constant_iterator(
+ _Tp __value,
+ _Index2 __index
+) noexcept(::cuda::std::is_nothrow_move_constructible_v< _Tp >)
+```
+
+
+**Parameters**
+
+
+The value to store in the `constant_iterator`
+
+
+
+The index in the sequence represented by this `constant_iterator`
+
+
+
+
+
+---
+
+## Methods
+
+### index inline constexpr const noexcept nodiscard
+
+Returns the current index.
+
+
+```cpp showLineNumbers={false}
+difference_type cuda::constant_iterator<_Tp, _Index>::index() const noexcept
+```
+
+
+### operator* inline constexpr const noexcept nodiscard
+
+Returns a const reference to the stored value.
+
+
+```cpp showLineNumbers={false}
+const _Tp & cuda::constant_iterator<_Tp, _Index>::operator*() const noexcept
+```
+
+
+### operator[] inline constexpr const noexcept nodiscard
+
+Returns a const reference to the stored value.
+
+
+```cpp showLineNumbers={false}
+const _Tp & cuda::constant_iterator<_Tp, _Index>::operator[](
+ difference_type
+) const noexcept
+```
+
+
+### operator++ inline constexpr noexcept
+
+
+
+
+Increments the stored index.
+
+
+```cpp showLineNumbers={false}
+constant_iterator & cuda::constant_iterator<_Tp, _Index>::operator++() noexcept
+```
+
+
+
+
+
+Increments the stored index.
+
+
+```cpp showLineNumbers={false}
+constant_iterator cuda::constant_iterator<_Tp, _Index>::operator++(
+ int
+) noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Tp >)
+```
+
+
+
+
+
+### operator-- inline constexpr noexcept
+
+
+
+
+Decrements the stored index.
+
+
+```cpp showLineNumbers={false}
+constant_iterator & cuda::constant_iterator<_Tp, _Index>::operator--() noexcept
+```
+
+
+
+
+
+Decrements the stored index.
+
+
+```cpp showLineNumbers={false}
+constant_iterator cuda::constant_iterator<_Tp, _Index>::operator--(
+ int
+) noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Tp >)
+```
+
+
+
+
+
+### operator+= inline constexpr noexcept
+
+Advances a `constant_iterator` by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+constant_iterator & cuda::constant_iterator<_Tp, _Index>::operator+=(
+ difference_type __n
+) noexcept
+```
+
+
+**Parameters**
+
+
+The amount of elements to advance
+
+
+### operator-= inline constexpr noexcept
+
+Decrements a `constant_iterator` by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+constant_iterator & cuda::constant_iterator<_Tp, _Index>::operator-=(
+ difference_type __n
+) noexcept
+```
+
+
+**Parameters**
+
+
+The amount of elements to decrement
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `iterator_concept` | `::cuda::std::random_access_iterator_tag` |
+| `iterator_category` | `::cuda::std::random_access_iterator_tag` |
+| `value_type` | `_Tp` |
+| `difference_type` | `::cuda::std::ptrdiff_t` |
+| `reference` | `_Tp` |
+| `pointer` | `void` |
diff --git a/fern/cudapages/cuda/cuda/cuda/copy_configuration.mdx b/fern/cudapages/cuda/cuda/cuda/copy_configuration.mdx
new file mode 100644
index 0000000..8939919
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/copy_configuration.mdx
@@ -0,0 +1,20 @@
+---
+title: "cuda::copy_configuration"
+description: "Configuration for copy_bytes."
+---
+
+Configuration for copy_bytes.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `src_location_hint` | `memory_location` | Source memory location hint for copy_bytes, used only for managed memory. |
+| `dst_location_hint` | `memory_location` | Destination memory location hint for copy_bytes, used only for managed memory. |
+| `src_access_order` | `source_access_order` | Source access order for copy_bytes. |
diff --git a/fern/cudapages/cuda/cuda/cuda/counting_iterator.mdx b/fern/cudapages/cuda/cuda/cuda/counting_iterator.mdx
new file mode 100644
index 0000000..ad3924d
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/counting_iterator.mdx
@@ -0,0 +1,232 @@
+---
+title: "cuda::counting_iterator"
+description: "A [`counting_iterator`](/libcudacxx/api/cuda::counting_iterator) represents an iterator into a range of sequentially increasing values."
+---
+
+A `counting_iterator` represents an iterator into a range of sequentially increasing values.
+
+This iterator is useful for creating a range filled with a sequence without explicitly storing it in memory. Using `counting_iterator` saves memory capacity and bandwidth.
+
+The following code snippet demonstrates how to create a `counting_iterator` whose [`value_type`](/libcudacxx/api/cuda::counting_iterator::value_type) is `int`
+
+```cpp showLineNumbers={false}
+#include
+```
+
+## Example
+
+```cpp showLineNumbers={false}
+#include <cuda/iterator>
+...
+// create iterators
+cuda::counting_iterator<int> first(10);
+cuda::counting_iterator<int> last = first + 3;
+
+first[0] // returns 10
+first[1] // returns 11
+first[100] // returns 110
+
+// sum of [first, last)
+std::reduce(first, last); // returns 33 (i.e. 10 + 11 + 12)
+
+// initialize vector to [0,1,2,..]
+cuda::counting_iterator<int> iter(0);
+std::vector<int> vec(500);
+std::copy(iter, iter + vec.size(), vec.begin());
+```
+
+
+
+
+
+The value type of the `counting_iterator`.
+
+
+
+
+
+**Inherits from:** `__counting_iterator_category< _Start >` (public)
+
+---
+
+## Constructors
+
+### counting_iterator inline constexpr noexcept
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+cuda::counting_iterator<_Start,,>::counting_iterator() noexcept(::cuda::std::is_nothrow_default_constructible_v< _Start2 >)
+```
+
+
+
+
+
+explicit
+
+Creates a `counting_iterator` from an initial value.
+
+
+```cpp showLineNumbers={false}
+cuda::counting_iterator<_Start,,>::counting_iterator(
+ _Start __value
+) noexcept(::cuda::std::is_nothrow_move_constructible_v< _Start >)
+```
+
+
+**Parameters**
+
+
+The value to store in the `counting_iterator`
+
+
+
+
+
+---
+
+## Methods
+
+### operator* inline constexpr const noexcept nodiscard
+
+Returns the value currently stored in the `counting_iterator`.
+
+
+```cpp showLineNumbers={false}
+_Start cuda::counting_iterator<_Start,,>::operator*() const noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Start >)
+```
+
+
+### operator[] inline constexpr const noexcept nodiscard
+
+Returns the value currently stored in the `counting_iterator` advanced by a number of steps.
+
+
+```cpp showLineNumbers={false}
+template
+_Start2 cuda::counting_iterator<_Start,,>::operator[](
+ difference_type __n
+) const noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Start2 > &&noexcept(::cuda::std::declval< const _Start2 & >()+__n))
+```
+
+
+**Parameters**
+
+
+The amount of elements to advance
+
+
+### operator++ inline constexpr noexcept
+
+
+
+
+Increments the stored value.
+
+
+```cpp showLineNumbers={false}
+counting_iterator & cuda::counting_iterator<_Start,,>::operator++() noexcept(++::cuda::std::declval< _Start & >())
+```
+
+
+
+
+
+Increments the stored value.
+
+
+```cpp showLineNumbers={false}
+auto cuda::counting_iterator<_Start,,>::operator++(
+ int
+) noexcept(noexcept(++::cuda::std::declval< _Start & >()) &&::cuda::std::is_nothrow_copy_constructible_v< _Start >)
+```
+
+
+
+
+
+### operator-- inline constexpr noexcept
+
+
+
+
+Decrements the stored value.
+
+
+```cpp showLineNumbers={false}
+template
+counting_iterator & cuda::counting_iterator<_Start,,>::operator--() noexcept(--::cuda::std::declval< _Start2 & >())
+```
+
+
+
+
+
+Decrements the stored value.
+
+
+```cpp showLineNumbers={false}
+template
+counting_iterator cuda::counting_iterator<_Start,,>::operator--(
+ int
+) noexcept(noexcept(--::cuda::std::declval< _Start2 & >()) &&::cuda::std::is_nothrow_copy_constructible_v< _Start >)
+```
+
+
+
+
+
+### operator+= inline constexpr noexcept
+
+Increments the stored value by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+counting_iterator & cuda::counting_iterator<_Start,,>::operator+=(
+ difference_type __n
+) noexcept(::cuda::std::__integer_like< _Start >)
+```
+
+
+**Parameters**
+
+
+The number of elements to increment
+
+
+### operator-= inline constexpr noexcept
+
+Decrements the stored value by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+template
+counting_iterator & cuda::counting_iterator<_Start,,>::operator-=(
+ difference_type __n
+) noexcept(::cuda::std::__integer_like< _Start2 >)
+```
+
+
+**Parameters**
+
+
+The amount of elements to decrement
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `iterator_concept` | `::cuda::std::conditional_t< __advanceable< _Start >, ::cuda::std::random_access_iterator_tag, ::cuda::std::conditional_t< __decrementable< _Start >, ::cuda::std::bidirectional_iterator_tag, ::cuda::std::conditional_t<::cuda::std::incrementable< _Start >, ::cuda::std::forward_iterator_tag, ::cuda::std::input_iterator_tag > > >` |
+| `value_type` | `_Start` |
+| `difference_type` | `_IotaDiffT< _Start >` |
+| `reference` | `_Start` |
+| `pointer` | `void` |
diff --git a/fern/cudapages/cuda/cuda/cuda/device_attributes/compute_capability_t.mdx b/fern/cudapages/cuda/cuda/cuda/device_attributes/compute_capability_t.mdx
new file mode 100644
index 0000000..00f387c
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/device_attributes/compute_capability_t.mdx
@@ -0,0 +1,32 @@
+---
+title: "cuda::device_attributes::compute_capability_t"
+description: ""
+---
+
+```cpp showLineNumbers={false}
+#include
+```
+
+---
+
+## Methods
+
+### operator() inline const nodiscard
+
+
+```cpp showLineNumbers={false}
+type cuda::device_attributes::compute_capability_t::operator()(
+ device_ref __dev_id
+) const
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `type` | `::cuda::compute_capability` |
diff --git a/fern/cudapages/cuda/cuda/cuda/device_memory_pool.mdx b/fern/cudapages/cuda/cuda/cuda/device_memory_pool.mdx
new file mode 100644
index 0000000..bdbb564
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/device_memory_pool.mdx
@@ -0,0 +1,140 @@
+---
+title: "cuda::device_memory_pool"
+description: ""
+---
+
+`device_memory_pool` allocates device memory using
+[`cudaMallocFromPoolAsync` / `cudaFreeAsync`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html)
+for allocation/deallocation. When constructed, it creates an underlying `cudaMemPool_t` with the location type set to `cudaMemLocationTypeDevice` and owns it.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+**Inherits from:** `cuda::device_memory_pool_ref` (public)
+
+---
+
+## Constructors
+
+### device_memory_pool inline
+
+
+
+
+Constructs a `device_memory_pool` with the optionally specified initial pool size and release threshold.
+
+If the pool size grows beyond the release threshold, unused memory held by the pool will be released at the next synchronization event.
+
+
+```cpp showLineNumbers={false}
+cuda::device_memory_pool::device_memory_pool(
+ ::cuda::device_ref __device_id,
+ memory_pool_properties __properties = {}
+)
+```
+
+
+**Throws:** `cuda_error` if the CUDA version does not support `cudaMallocAsync`.
+
+**Parameters**
+
+
+The device id of the device the stream pool is constructed on.
+
+
+
+Optional, additional properties of the pool to be created.
+
+
+
+
+
+noexcept
+
+
+```cpp showLineNumbers={false}
+cuda::device_memory_pool::device_memory_pool(
+ ::cudaMemPool_t __pool
+) noexcept
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+cuda::device_memory_pool::device_memory_pool(
+ const device_memory_pool &
+) = delete
+```
+
+
+
+
+
+### Destructor
+
+### ~device_memory_pool inline noexcept
+
+
+```cpp showLineNumbers={false}
+cuda::device_memory_pool::~device_memory_pool() noexcept
+```
+
+
+---
+
+## Assignment operators
+
+### operator=
+
+
+```cpp showLineNumbers={false}
+device_memory_pool & cuda::device_memory_pool::operator=(
+ const device_memory_pool &
+) = delete
+```
+
+
+---
+
+## Methods
+
+### as_ref inline noexcept nodiscard
+
+Returns a [`device_memory_pool_ref`](/libcudacxx/api/cuda::device_memory_pool_ref) for this `device_memory_pool`.
+
+We return by reference to ensure that we can subsequently convert to a resource_ref
+
+
+```cpp showLineNumbers={false}
+device_memory_pool_ref & cuda::device_memory_pool::as_ref() noexcept
+```
+
+
+---
+
+## Static methods
+
+### from_native_handle inline static noexcept
+
+
+```cpp showLineNumbers={false}
+static device_memory_pool cuda::device_memory_pool::from_native_handle(
+ ::cudaMemPool_t __pool
+) noexcept
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `reference_type` | `device_memory_pool_ref` |
+| `default_queries` | `::cuda::mr::properties_list<::cuda::mr::device_accessible >` |
diff --git a/fern/cudapages/cuda/cuda/cuda/device_memory_pool_ref.mdx b/fern/cudapages/cuda/cuda/cuda/device_memory_pool_ref.mdx
new file mode 100644
index 0000000..b64f489
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/device_memory_pool_ref.mdx
@@ -0,0 +1,68 @@
+---
+title: "cuda::device_memory_pool_ref"
+description: ""
+---
+
+`device_memory_pool_ref` allocates device memory using
+[`cudaMallocFromPoolAsync` / `cudaFreeAsync`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html)
+for allocation/deallocation. A `device_memory_pool_ref` is a thin wrapper around a `cudaMemPool_t` with the location type set to `cudaMemLocationTypeDevice`.
+
+**Warning**
+
+`device_memory_pool_ref` does not own the pool, and it is the responsibility of the user to ensure that the lifetime of the pool exceeds the lifetime of the `device_memory_pool_ref`.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+**Inherits from:** `cuda::__memory_pool_base` (public)
+
+---
+
+## Constructors
+
+### device_memory_pool_ref inline explicit noexcept
+
+
+
+
+Constructs the `device_memory_pool_ref` from a `cudaMemPool_t`.
+
+
+```cpp showLineNumbers={false}
+cuda::device_memory_pool_ref::device_memory_pool_ref(
+ ::cudaMemPool_t __pool
+) noexcept
+```
+
+
+**Parameters**
+
+
+The `cudaMemPool_t` used to allocate memory.
+
+
+
+
+
+The following overloads are deleted to prevent misuse:
+
+
+```cpp showLineNumbers={false}
+cuda::device_memory_pool_ref::device_memory_pool_ref(int) = delete;
+cuda::device_memory_pool_ref::device_memory_pool_ref(::cuda::std::nullptr_t) = delete;
+```
+
+
+
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `default_queries` | `::cuda::mr::properties_list<::cuda::mr::device_accessible >` |
diff --git a/fern/cudapages/cuda/cuda/cuda/device_ref.mdx b/fern/cudapages/cuda/cuda/cuda/device_ref.mdx
new file mode 100644
index 0000000..beefd1b
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/device_ref.mdx
@@ -0,0 +1,155 @@
+---
+title: "cuda::device_ref"
+description: "A non-owning representation of a CUDA device."
+---
+
+A non-owning representation of a CUDA device.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+---
+
+## Constructors
+
+### device_ref inline constexpr noexcept
+
+Create a `device_ref` object from a native device ordinal.
+
+
+```cpp showLineNumbers={false}
+cuda::device_ref::device_ref(
+ int __id
+) noexcept
+```
+
+
+---
+
+## Methods
+
+### get inline constexpr const noexcept nodiscard
+
+Retrieve the native ordinal of the `device_ref`.
+
+
+```cpp showLineNumbers={false}
+int cuda::device_ref::get() const noexcept
+```
+
+
+**Returns:** int The native device ordinal held by the `device_ref` object
+
+### attribute inline const nodiscard
+
+
+
+
+Retrieve the specified attribute for the device.
+
+
+```cpp showLineNumbers={false}
+template
+auto cuda::device_ref::attribute(
+ _Attr __attr
+) const
+```
+
+
+**Throws:** `cuda_error` if the attribute query fails
+
+**Parameters**
+
+
+The attribute to query. See `device::attrs` for the available attributes.
+
+
+**See also:**
+device::attrs
+
+
+
+
+
+This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
+
+
+```cpp showLineNumbers={false}
+template <::cudaDeviceAttr _Attr>
+auto cuda::device_ref::attribute() const
+```
+
+
+
+
+
+### operator memory_location inline const noexcept nodiscard
+
+Retrieve the memory location of this device.
+
+
+```cpp showLineNumbers={false}
+cuda::device_ref::operator memory_location() const noexcept
+```
+
+
+**Returns:** The memory location of this device
+
+### init inline const
+
+Initializes the primary context of the device.
+
+
+```cpp showLineNumbers={false}
+void cuda::device_ref::init() const
+```
+
+
+### name inline const nodiscard
+
+Retrieve the name of this device.
+
+
+```cpp showLineNumbers={false}
+cuda::std::string_view cuda::device_ref::name() const
+```
+
+
+**Returns:** String view containing the name of this device.
+
+### has_peer_access_to inline const nodiscard
+
+Queries if it's possible for this device to directly access the specified device's memory.
+
+If this function returns true, the device supplied to this call can be passed into `enable_peer_access` on a memory resource or pool that manages memory on this device. It will make allocations from that pool accessible by this device.
+
+
+```cpp showLineNumbers={false}
+bool cuda::device_ref::has_peer_access_to(
+ device_ref __other_dev
+) const
+```
+
+
+**Returns:** true if it's possible for this device to access the specified device's memory
+
+**Parameters**
+
+
+Device to query the peer access
+
+
+### peers inline const nodiscard
+
+Retrieve `device_ref`s that are peers of this device.
+
+The device on which this API is called is not included in the vector.
+
+
+```cpp showLineNumbers={false}
+cuda::std::span cuda::device_ref::peers() const
+```
+
+
+**Throws:** `cuda_error` if any peer access query fails
diff --git a/fern/cudapages/cuda/cuda/cuda/discard_iterator.mdx b/fern/cudapages/cuda/cuda/cuda/discard_iterator.mdx
new file mode 100644
index 0000000..01d78da
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/discard_iterator.mdx
@@ -0,0 +1,264 @@
+---
+title: "cuda::discard_iterator"
+description: "[`discard_iterator`](/libcudacxx/api/cuda::discard_iterator) is an iterator which represents a special kind of pointer that ignores values written to it upon dereference."
+---
+
+`discard_iterator` is an iterator which represents a special kind of pointer that ignores values written to it upon dereference.
+
+This iterator is useful for ignoring the output of certain algorithms without wasting memory capacity or bandwidth. `discard_iterator` may also be used to count the size of an algorithm's output which may not be known a priori.
+
+The following code snippet demonstrates how to use `discard_iterator` to ignore one of the output ranges of reduce_by_key
+
+```cpp showLineNumbers={false}
+#include
+```
+
+## Example
+
+```cpp showLineNumbers={false}
+#include <cuda/iterator>
+#include <thrust/reduce.h>
+#include <thrust/device_vector.h>
+
+int main()
+{
+ thrust::device_vector<int> keys{1, 3, 3, 3, 2, 2, 1};
+ thrust::device_vector<int> values{9, 8, 7, 6, 5, 4, 3};
+
+ thrust::device_vector<int> result(4);
+
+ // we are only interested in the reduced values
+ // use discard_iterator to ignore the output keys
+ thrust::reduce_by_key(keys.begin(), keys.end(),
+ values.begin(),
+ cuda::discard_iterator{},
+ result.begin());
+
+ // result is now [9, 21, 9, 3]
+
+ return 0;
+}
+```
+
+---
+
+## Constructors
+
+### discard_iterator constexpr
+
+
+
+
+Default constructs a `discard_iterator` at index zero.
+
+
+```cpp showLineNumbers={false}
+cuda::discard_iterator::discard_iterator() = default
+```
+
+
+
+
+
+inline noexcept
+
+Constructs a `discard_iterator` with a given index.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::discard_iterator::discard_iterator(
+ _Integer __index
+) noexcept
+```
+
+
+**Parameters**
+
+
+The index used for the discard iterator
+
+
+
+
+
+---
+
+## Methods
+
+### index inline constexpr const noexcept nodiscard
+
+Returns the stored index.
+
+
+```cpp showLineNumbers={false}
+difference_type cuda::discard_iterator::index() const noexcept
+```
+
+
+### operator* inline constexpr const noexcept nodiscard
+
+Dereferences the `discard_iterator` returning a proxy that discards all values that are assigned to it.
+
+
+```cpp showLineNumbers={false}
+__discard_proxy cuda::discard_iterator::operator*() const noexcept
+```
+
+
+### operator[] inline constexpr const noexcept nodiscard
+
+Subscripts the `discard_iterator` returning a proxy that discards all values that are assigned to it.
+
+
+```cpp showLineNumbers={false}
+__discard_proxy cuda::discard_iterator::operator[](
+ difference_type
+) const noexcept
+```
+
+
+### operator++ inline constexpr noexcept
+
+
+
+
+Increments the stored index.
+
+
+```cpp showLineNumbers={false}
+discard_iterator & cuda::discard_iterator::operator++() noexcept
+```
+
+
+
+
+
+Increments the stored index.
+
+
+```cpp showLineNumbers={false}
+discard_iterator cuda::discard_iterator::operator++(
+ int
+) noexcept
+```
+
+
+
+
+
+### operator-- inline constexpr noexcept
+
+
+
+
+Decrements the stored index.
+
+
+```cpp showLineNumbers={false}
+discard_iterator & cuda::discard_iterator::operator--() noexcept
+```
+
+
+
+
+
+Decrements the stored index.
+
+
+```cpp showLineNumbers={false}
+discard_iterator cuda::discard_iterator::operator--(
+ int
+) noexcept
+```
+
+
+
+
+
+### operator+ inline constexpr const noexcept nodiscard
+
+Returns a copy of this `discard_iterator` advanced by a number of elements.
+
+
+```cpp showLineNumbers={false}
+discard_iterator cuda::discard_iterator::operator+(
+ difference_type __n
+) const noexcept
+```
+
+
+**Parameters**
+
+
+The number of elements to advance
+
+
+### operator+= inline constexpr noexcept
+
+Advances the index of this `discard_iterator` by a number of elements.
+
+
+```cpp showLineNumbers={false}
+discard_iterator & cuda::discard_iterator::operator+=(
+ difference_type __n
+) noexcept
+```
+
+
+**Parameters**
+
+
+The number of elements to advance
+
+
+### operator- inline constexpr const noexcept nodiscard
+
+Returns a copy of this `discard_iterator` decremented by a number of elements.
+
+
+```cpp showLineNumbers={false}
+discard_iterator cuda::discard_iterator::operator-(
+ difference_type __n
+) const noexcept
+```
+
+
+**Parameters**
+
+
+The number of elements to decrement
+
+
+### operator-= inline constexpr noexcept
+
+Decrements the index of the `discard_iterator` by a number of elements.
+
+
+```cpp showLineNumbers={false}
+discard_iterator & cuda::discard_iterator::operator-=(
+ difference_type __n
+) noexcept
+```
+
+
+**Parameters**
+
+
+The number of elements to decrement
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `iterator_concept` | `::cuda::std::random_access_iterator_tag` |
+| `iterator_category` | `::cuda::std::random_access_iterator_tag` |
+| `difference_type` | `::cuda::std::ptrdiff_t` |
+| `value_type` | `void` |
+| `pointer` | `void` |
+| `reference` | `void` |
diff --git a/fern/cudapages/cuda/cuda/cuda/event.mdx b/fern/cudapages/cuda/cuda/cuda/event.mdx
new file mode 100644
index 0000000..dd614f1
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/event.mdx
@@ -0,0 +1,342 @@
+---
+title: "cuda::event"
+description: "An owning wrapper for an untimed `cudaEvent_t`."
+---
+
+An owning wrapper for an untimed `cudaEvent_t`.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+**Inherits from:** `cuda::event_ref` (public)
+
+---
+
+## Constructors
+
+### event inline
+
+
+
+
+explicit
+
+Construct a new `event` object with timing disabled, and record the event in the specified stream.
+
+
+```cpp showLineNumbers={false}
+cuda::event::event(
+ stream_ref __stream,
+ event_flags __flags = event_flags::none
+)
+```
+
+
+**Throws:** `cuda_error` if the event creation fails.
+
+
+
+
+explicit
+
+Construct a new `event` object with timing disabled.
+
+The event can only be recorded on streams from the specified device.
+
+
+```cpp showLineNumbers={false}
+cuda::event::event(
+ device_ref __device,
+ event_flags __flags = event_flags::none
+)
+```
+
+
+**Throws:** `cuda_error` if the event creation fails.
+
+
+
+
+constexpr explicit noexcept
+
+Construct a new `event` object into the moved-from state.
+
+
+```cpp showLineNumbers={false}
+cuda::event::event(
+ no_init_t
+) noexcept
+```
+
+
+
+[`get()`](/libcudacxx/api/cuda::event_ref::get()) returns `cudaEvent_t()`.
+
+
+
+
+
+constexpr noexcept
+
+Move-construct a new `event` object.
+
+
+```cpp showLineNumbers={false}
+cuda::event::event(
+ event &&__other
+) noexcept
+```
+
+
+
+`__other` is in a moved-from state.
+
+
+
+
+
+constexpr explicit noexcept
+
+
+```cpp showLineNumbers={false}
+cuda::event::event(
+ ::cudaEvent_t __evnt
+) noexcept
+```
+
+
+
+
+
+explicit
+
+
+```cpp showLineNumbers={false}
+cuda::event::event(
+ stream_ref __stream,
+ unsigned __flags
+)
+```
+
+
+
+
+
+explicit
+
+
+```cpp showLineNumbers={false}
+cuda::event::event(
+ device_ref __device,
+ unsigned __flags
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+cuda::event::event(
+ const event &
+) = delete
+```
+
+
+
+
+
+### Destructor
+
+### ~event inline
+
+Destroy the `event` object.
+
+
+```cpp showLineNumbers={false}
+cuda::event::~event()
+```
+
+
+
+If the event fails to be destroyed, the error is silently ignored.
+
+
+---
+
+## Assignment operators
+
+### operator= inline noexcept
+
+
+
+
+Move-assign an `event` object.
+
+
+```cpp showLineNumbers={false}
+event & cuda::event::operator=(
+ event &&__other
+) noexcept
+```
+
+
+
+`__other` is in a moved-from state.
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+event & cuda::event::operator=(
+ const event &
+) = delete
+```
+
+
+
+
+
+---
+
+## Methods
+
+### release inline noexcept nodiscard
+
+Retrieve the native `cudaEvent_t` handle and give up ownership.
+
+
+```cpp showLineNumbers={false}
+::cudaEvent_t cuda::event::release() noexcept
+```
+
+
+
+The event object is in a moved-from state.
+
+
+**Returns:** cudaEvent_t The native handle being held by the `event` object.
+
+### record inline const
+
+Records an event on the specified stream.
+
+
+```cpp showLineNumbers={false}
+void cuda::event::record(
+ stream_ref __stream
+) const
+```
+
+
+**Throws:** `cuda_error` if the event record fails
+
+### sync inline const
+
+Synchronizes the event.
+
+
+```cpp showLineNumbers={false}
+void cuda::event::sync() const
+```
+
+
+**Throws:** `cuda_error` if waiting for the event fails
+
+### is_done inline const nodiscard
+
+Checks if all the work in the stream prior to the record of the event has completed.
+
+If is_done returns true, calling [sync()](/libcudacxx/api/cuda::event_ref::sync()) on this event will return immediately
+
+
+```cpp showLineNumbers={false}
+bool cuda::event::is_done() const
+```
+
+
+**Throws:** `cuda_error` if the event query fails
+
+### get inline const noexcept nodiscard
+
+Retrieve the native `cudaEvent_t` handle.
+
+
+```cpp showLineNumbers={false}
+::cudaEvent_t cuda::event::get() const noexcept
+```
+
+
+**Returns:** cudaEvent_t The native handle being held by the [event_ref](/libcudacxx/api/cuda::event_ref) object.
+
+### operator bool inline constexpr explicit const noexcept nodiscard
+
+Checks if the [`event_ref`](/libcudacxx/api/cuda::event_ref) is valid.
+
+
+```cpp showLineNumbers={false}
+cuda::event::operator bool() const noexcept
+```
+
+
+**Returns:** true if the [`event_ref`](/libcudacxx/api/cuda::event_ref) is valid, false otherwise.
+
+---
+
+## Static methods
+
+### from_native_handle inline static noexcept nodiscard
+
+
+
+
+Construct an `event` object from a native `cudaEvent_t` handle.
+
+
+```cpp showLineNumbers={false}
+static event cuda::event::from_native_handle(
+ ::cudaEvent_t __evnt
+) noexcept
+```
+
+
+
+The constructed `event` object takes ownership of the native handle.
+
+
+**Returns:** event The constructed `event` object
+
+**Parameters**
+
+
+The native handle
+
+
+
+
+
+The following overloads are deleted to prevent misuse:
+
+
+```cpp showLineNumbers={false}
+static event cuda::event::from_native_handle(int) = delete;
+static event cuda::event::from_native_handle(::cuda::std::nullptr_t) = delete;
+```
+
+
+
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `value_type` | `::cudaEvent_t` |
diff --git a/fern/cudapages/cuda/cuda/cuda/event_ref.mdx b/fern/cudapages/cuda/cuda/cuda/event_ref.mdx
new file mode 100644
index 0000000..66b544d
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/event_ref.mdx
@@ -0,0 +1,132 @@
+---
+title: "cuda::event_ref"
+description: "A non-owning wrapper for an untimed `cudaEvent_t`."
+---
+
+A non-owning wrapper for an untimed `cudaEvent_t`.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+---
+
+## Constructors
+
+### event_ref inline constexpr noexcept
+
+
+
+
+Construct a new `event_ref` object from a `cudaEvent_t`.
+
+This constructor provides an implicit conversion from `cudaEvent_t`
+
+
+```cpp showLineNumbers={false}
+cuda::event_ref::event_ref(
+ ::cudaEvent_t __evnt
+) noexcept
+```
+
+
+
+**Note:** It is the caller's responsibility to ensure the `event_ref` does not outlive the event denoted by the `cudaEvent_t` handle.
+
+
+
+[`get()`](/libcudacxx/api/cuda::event_ref::get())` == __evnt`
+
+
+
+
+
+The following overloads are deleted to prevent misuse:
+
+
+```cpp showLineNumbers={false}
+cuda::event_ref::event_ref(int) = delete;
+cuda::event_ref::event_ref(::cuda::std::nullptr_t) = delete;
+```
+
+
+
+
+
+---
+
+## Methods
+
+### record inline const
+
+Records an event on the specified stream.
+
+
+```cpp showLineNumbers={false}
+void cuda::event_ref::record(
+ stream_ref __stream
+) const
+```
+
+
+**Throws:** `cuda_error` if the event record fails
+
+### sync inline const
+
+Synchronizes the event.
+
+
+```cpp showLineNumbers={false}
+void cuda::event_ref::sync() const
+```
+
+
+**Throws:** `cuda_error` if waiting for the event fails
+
+### is_done inline const nodiscard
+
+Checks if all the work in the stream prior to the record of the event has completed.
+
+If is_done returns true, calling [sync()](/libcudacxx/api/cuda::event_ref::sync()) on this event will return immediately
+
+
+```cpp showLineNumbers={false}
+bool cuda::event_ref::is_done() const
+```
+
+
+**Throws:** `cuda_error` if the event query fails
+
+### get inline const noexcept nodiscard
+
+Retrieve the native `cudaEvent_t` handle.
+
+
+```cpp showLineNumbers={false}
+::cudaEvent_t cuda::event_ref::get() const noexcept
+```
+
+
+**Returns:** cudaEvent_t The native handle being held by the `event_ref` object.
+
+### operator bool inline constexpr explicit const noexcept nodiscard
+
+Checks if the `event_ref` is valid.
+
+
+```cpp showLineNumbers={false}
+cuda::event_ref::operator bool() const noexcept
+```
+
+
+**Returns:** true if the `event_ref` is valid, false otherwise.
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `value_type` | `::cudaEvent_t` |
diff --git a/fern/cudapages/cuda/cuda/cuda/get_stream_t.mdx b/fern/cudapages/cuda/cuda/cuda/get_stream_t.mdx
new file mode 100644
index 0000000..2a469b8
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/get_stream_t.mdx
@@ -0,0 +1,92 @@
+---
+title: "cuda::get_stream_t"
+description: "[`get_stream`](/libcudacxx/api/cuda::get_stream) is a customization point object that queries a type `T` for an associated stream"
+---
+
+[`get_stream`](/libcudacxx/api/cuda::get_stream) is a customization point object that queries a type `T` for an associated stream
+
+```cpp showLineNumbers={false}
+#include
+```
+
+---
+
+## Methods
+
+### operator() inline const noexcept nodiscard
+
+
+
+
+
+```cpp showLineNumbers={false}
+::cuda::stream_ref cuda::get_stream_t::operator()(
+ ::cudaStream_t __stream
+) const noexcept
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+::cuda::stream_ref cuda::get_stream_t::operator()(
+ const _Tp &__t
+) const noexcept(static_cast<::cuda::stream_ref >(__t))
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+::cuda::stream_ref cuda::get_stream_t::operator()(
+ const _Tp &__t
+) const noexcept(__t.stream())
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+::cuda::stream_ref cuda::get_stream_t::operator()(
+ const _Tp &__t
+) const noexcept(__t.get_stream())
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+::cuda::stream_ref cuda::get_stream_t::operator()(
+ const _Env &__env
+) const noexcept
+```
+
+
+
+
+
+---
+
+## Static methods
+
+### query inline static constexpr noexcept nodiscard
+
+
+```cpp showLineNumbers={false}
+static constexpr bool cuda::get_stream_t::query(
+ ::cuda::std::execution::forwarding_query_t
+) noexcept
+```
+
diff --git a/fern/cudapages/cuda/cuda/cuda/has_property.mdx b/fern/cudapages/cuda/cuda/cuda/has_property.mdx
new file mode 100644
index 0000000..fdaa128
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/has_property.mdx
@@ -0,0 +1,33 @@
+---
+title: "cuda::has_property"
+description: "The `has_property` concept verifies that a Resource satisfies a given Property."
+---
+
+C++20 concept
+
+The `has_property` concept verifies that a Resource satisfies a given Property.
+
+
+```cpp showLineNumbers={false}
+template
+concept has_property = /* see description */;
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Description
+
+For `has_property` we require the following free function to be callable
diff --git a/fern/cudapages/cuda/cuda/cuda/has_property_with.mdx b/fern/cudapages/cuda/cuda/cuda/has_property_with.mdx
new file mode 100644
index 0000000..4ce85b8
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/has_property_with.mdx
@@ -0,0 +1,36 @@
+---
+title: "cuda::has_property_with"
+description: "The `has_property_with` concept verifies that a Resource satisfies a given stateful Property."
+---
+
+C++20 concept
+
+The `has_property_with` concept verifies that a Resource satisfies a given stateful Property.
+
+
+```cpp showLineNumbers={false}
+template
+concept has_property_with = /* see description */;
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Description
+
+For `has_property_with` we require the following free function to be callable and its return type to exactly match the `value_type` of the Property
diff --git a/fern/cudapages/cuda/cuda/cuda/heterogeneous_iterator.mdx b/fern/cudapages/cuda/cuda/cuda/heterogeneous_iterator.mdx
new file mode 100644
index 0000000..0eacc5b
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/heterogeneous_iterator.mdx
@@ -0,0 +1,272 @@
+---
+title: "cuda::heterogeneous_iterator"
+description: ""
+---
+
+`heterogeneous_iterator` provides type-safe access to heterogeneous memory. Depending on whether the memory is tagged as host-accessible and / or device-accessible the iterator restricts memory access. All operations that do not require memory access are always available on host and device.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+
+
+
+
+
+
+
+The properties that the `heterogeneous_iterator` is tagged with.
+
+
+
+
+
+**Inherits from:** `cuda::__heterogeneous_iterator_access< ::cuda::std::remove_const_t< _CvTp >, ::cuda::std::is_const_v< _CvTp > ? __is_heterogeneous_const_iter::__yes :__is_heterogeneous_const_iter::__no, ::cuda::mr::__memory_accessability_from_properties< _Properties... >::value >` (public)
+
+---
+
+## Constructors
+
+### heterogeneous_iterator
+
+
+
+
+
+```cpp showLineNumbers={false}
+cuda::heterogeneous_iterator<_CvTp, _Properties>::heterogeneous_iterator() = default
+```
+
+
+
+
+
+inline constexpr noexcept
+
+Construct a `heterogeneous_iterator` from a pointer to the underlying memory.
+
+
+```cpp showLineNumbers={false}
+cuda::heterogeneous_iterator<_CvTp, _Properties>::heterogeneous_iterator(
+ pointer __ptr
+) noexcept
+```
+
+
+
+
+
+inline constexpr noexcept
+
+Constructs an immutable `heterogeneous_iterator` from a mutable one.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::heterogeneous_iterator<_CvTp, _Properties>::heterogeneous_iterator(
+ heterogeneous_iterator<_OtherTp, _Properties...> __other
+) noexcept
+```
+
+
+**Parameters**
+
+
+The mutable `heterogeneous_iterator`
+
+
+
+
+
+---
+
+## Methods
+
+### operator++ inline constexpr noexcept
+
+
+
+
+Increment of a `heterogeneous_iterator`.
+
+
+```cpp showLineNumbers={false}
+heterogeneous_iterator & cuda::heterogeneous_iterator<_CvTp, _Properties>::operator++() noexcept
+```
+
+
+**Returns:** The `heterogeneous_iterator` pointing to the next element
+
+
+
+
+Post-increment of a `heterogeneous_iterator`.
+
+
+```cpp showLineNumbers={false}
+heterogeneous_iterator cuda::heterogeneous_iterator<_CvTp, _Properties>::operator++(
+ int
+) noexcept
+```
+
+
+**Returns:** A copy of the `heterogeneous_iterator` pointing to the next element
+
+
+
+
+### operator-- inline constexpr noexcept
+
+
+
+
+Decrement of a `heterogeneous_iterator`.
+
+
+```cpp showLineNumbers={false}
+heterogeneous_iterator & cuda::heterogeneous_iterator<_CvTp, _Properties>::operator--() noexcept
+```
+
+
+**Returns:** The `heterogeneous_iterator` pointing to the previous element
+
+
+
+
+Post-decrement of a `heterogeneous_iterator`.
+
+
+```cpp showLineNumbers={false}
+heterogeneous_iterator cuda::heterogeneous_iterator<_CvTp, _Properties>::operator--(
+ int
+) noexcept
+```
+
+
+**Returns:** A copy of the `heterogeneous_iterator` pointing to the previous element
+
+
+
+
+### operator+= inline constexpr noexcept
+
+Advance a `heterogeneous_iterator`.
+
+
+```cpp showLineNumbers={false}
+heterogeneous_iterator & cuda::heterogeneous_iterator<_CvTp, _Properties>::operator+=(
+ const difference_type __count
+) noexcept
+```
+
+
+**Returns:** The `heterogeneous_iterator` advanced by `__count`
+
+**Parameters**
+
+
+The number of elements to advance.
+
+
+### operator+ inline constexpr const noexcept nodiscard
+
+Advance a `heterogeneous_iterator`.
+
+
+```cpp showLineNumbers={false}
+heterogeneous_iterator cuda::heterogeneous_iterator<_CvTp, _Properties>::operator+(
+ const difference_type __count
+) const noexcept
+```
+
+
+**Returns:** A copy of this `heterogeneous_iterator` advanced by `__count`
+
+**Parameters**
+
+
+The number of elements to advance.
+
+
+### operator-= inline constexpr noexcept
+
+Advance a `heterogeneous_iterator` by the negative value of `__count`.
+
+
+```cpp showLineNumbers={false}
+heterogeneous_iterator & cuda::heterogeneous_iterator<_CvTp, _Properties>::operator-=(
+ const difference_type __count
+) noexcept
+```
+
+
+**Returns:** The `heterogeneous_iterator` advanced by the negative value of `__count`
+
+**Parameters**
+
+
+The number of elements to advance.
+
+
+### operator- inline constexpr const noexcept nodiscard
+
+
+
+
+Advance a `heterogeneous_iterator` by the negative value of `__count`.
+
+
+```cpp showLineNumbers={false}
+heterogeneous_iterator cuda::heterogeneous_iterator<_CvTp, _Properties>::operator-(
+ const difference_type __count
+) const noexcept
+```
+
+
+**Returns:** A copy of this `heterogeneous_iterator` advanced by the negative value of `__count`
+
+**Parameters**
+
+
+The number of elements to advance.
+
+
+
+
+
+Distance between two `heterogeneous_iterator`s.
+
+
+```cpp showLineNumbers={false}
+difference_type cuda::heterogeneous_iterator<_CvTp, _Properties>::operator-(
+ const heterogeneous_iterator &__other
+) const noexcept
+```
+
+
+**Returns:** The distance between the two elements the `heterogeneous_iterator`s point to
+
+**Parameters**
+
+
+The other `heterogeneous_iterator`.
+
+
+
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `iterator_concept` | `::cuda::std::contiguous_iterator_tag` |
+| `iterator_category` | `::cuda::std::random_access_iterator_tag` |
+| `value_type` | `::cuda::std::remove_const_t< _CvTp >` |
+| `difference_type` | `::cuda::std::ptrdiff_t` |
+| `pointer` | `_CvTp *` |
+| `reference` | `_CvTp &` |
diff --git a/fern/cudapages/cuda/cuda/cuda/managed_memory_pool.mdx b/fern/cudapages/cuda/cuda/cuda/managed_memory_pool.mdx
new file mode 100644
index 0000000..2a5f10b
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/managed_memory_pool.mdx
@@ -0,0 +1,133 @@
+---
+title: "cuda::managed_memory_pool"
+description: ""
+---
+
+`managed_memory_pool` allocates managed memory using
+[`cudaMallocFromPoolAsync` / `cudaFreeAsync`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html)
+for allocation/deallocation. When constructed it creates an underlying `cudaMemPool_t` with the allocation type set to `cudaMemAllocationTypeManaged` and owns it.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+**Inherits from:** `cuda::managed_memory_pool_ref` (public)
+
+---
+
+## Constructors
+
+### managed_memory_pool inline
+
+
+
+
+Constructs a `managed_memory_pool` with optional properties.
+
+Properties include the initial pool size and the release threshold. If the pool size grows beyond the release threshold, unused memory held by the pool will be released at the next synchronization event.
+
+
+```cpp showLineNumbers={false}
+cuda::managed_memory_pool::managed_memory_pool(
+ memory_pool_properties __properties = {}
+)
+```
+
+
+**Parameters**
+
+
+Optional, additional properties of the pool to be created.
+
+
+
+
+
+noexcept
+
+
+```cpp showLineNumbers={false}
+cuda::managed_memory_pool::managed_memory_pool(
+ ::cudaMemPool_t __pool
+) noexcept
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+cuda::managed_memory_pool::managed_memory_pool(
+ const managed_memory_pool &
+) = delete
+```
+
+
+
+
+
+### Destructor
+
+### ~managed_memory_pool inline noexcept
+
+
+```cpp showLineNumbers={false}
+cuda::managed_memory_pool::~managed_memory_pool() noexcept
+```
+
+
+---
+
+## Assignment operators
+
+### operator=
+
+
+```cpp showLineNumbers={false}
+managed_memory_pool & cuda::managed_memory_pool::operator=(
+ const managed_memory_pool &
+) = delete
+```
+
+
+---
+
+## Methods
+
+### as_ref inline noexcept nodiscard
+
+Returns a [`managed_memory_pool_ref`](/libcudacxx/api/cuda::managed_memory_pool_ref) for this `managed_memory_pool`.
+
+We return by reference to ensure that we can subsequently convert to a resource_ref
+
+
+```cpp showLineNumbers={false}
+managed_memory_pool_ref & cuda::managed_memory_pool::as_ref() noexcept
+```
+
+
+---
+
+## Static methods
+
+### from_native_handle inline static noexcept
+
+
+```cpp showLineNumbers={false}
+static managed_memory_pool cuda::managed_memory_pool::from_native_handle(
+ ::cudaMemPool_t __pool
+) noexcept
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `reference_type` | `managed_memory_pool_ref` |
+| `default_queries` | `::cuda::mr::properties_list<::cuda::mr::device_accessible, ::cuda::mr::host_accessible >` |
diff --git a/fern/cudapages/cuda/cuda/cuda/managed_memory_pool_ref.mdx b/fern/cudapages/cuda/cuda/cuda/managed_memory_pool_ref.mdx
new file mode 100644
index 0000000..a46969f
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/managed_memory_pool_ref.mdx
@@ -0,0 +1,50 @@
+---
+title: "cuda::managed_memory_pool_ref"
+description: ""
+---
+
+`managed_memory_pool_ref` allocates managed memory using
+[`cudaMallocFromPoolAsync` / `cudaFreeAsync`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html)
+for allocation/deallocation. A `managed_memory_pool_ref` is a thin wrapper around a `cudaMemPool_t` with the allocation type set to `cudaMemAllocationTypeManaged`.
+
+> **Warning**
+>
+> `managed_memory_pool_ref` does not own the pool and it is the responsibility of the user to ensure that the lifetime of the pool exceeds the lifetime of the `managed_memory_pool_ref`.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+**Inherits from:** `cuda::__memory_pool_base` (public)
+
+---
+
+## Constructors
+
+### managed_memory_pool_ref inline explicit noexcept
+
+Constructs the `managed_memory_pool_ref` from a `cudaMemPool_t`.
+
+
+```cpp showLineNumbers={false}
+cuda::managed_memory_pool_ref::managed_memory_pool_ref(
+ ::cudaMemPool_t __pool
+) noexcept
+```
+
+
+**Parameters**
+
+
+The `cudaMemPool_t` used to allocate memory.
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `default_queries` | `::cuda::mr::properties_list<::cuda::mr::device_accessible, ::cuda::mr::host_accessible >` |
diff --git a/fern/cudapages/cuda/cuda/cuda/memory_pool_properties.mdx b/fern/cudapages/cuda/cuda/cuda/memory_pool_properties.mdx
new file mode 100644
index 0000000..7811a78
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/memory_pool_properties.mdx
@@ -0,0 +1,23 @@
+---
+title: "cuda::memory_pool_properties"
+description: "[`memory_pool_properties`](/libcudacxx/api/cuda::memory_pool_properties) is a type that controls the creation options of a memory pool."
+---
+
+`memory_pool_properties` is a type that controls the creation options of a memory pool.
+
+Compared to attributes, properties can not be set after the pool is created.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `initial_pool_size` | `size_t` | |
+| `release_threshold` | `size_t` | |
+| `allocation_handle_type` | `cudaMemAllocationHandleType` | |
+| `max_pool_size` | `size_t` | |
diff --git a/fern/cudapages/cuda/cuda/cuda/mr/basic_any_resource.mdx b/fern/cudapages/cuda/cuda/cuda/mr/basic_any_resource.mdx
new file mode 100644
index 0000000..fe8ce07
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/mr/basic_any_resource.mdx
@@ -0,0 +1,591 @@
+---
+title: "cuda::mr::basic_any_resource"
+description: ""
+---
+
+`basic_any_resource` wraps any given resource that satisfies the required properties. It owns the contained resource, taking care of construction / destruction. This makes it especially suited for use in e.g. container types that need to ensure that the lifetime of the container exceeds the lifetime of the memory resource used to allocate the storage.
+
+`basic_any_resource` models the `cuda::std::regular` concept.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+**See also:**
+[any_synchronous_resource](/library/api/cuda::mr::any_synchronous_resource),
+[any_resource](/library/api/cuda::mr::any_resource),
+[synchronous_resource_ref](/library/api/cuda::mr::synchronous_resource_ref),
+[resource_ref](/library/api/cuda::mr::resource_ref)
+
+
+
+
+
+Either [`_ResourceKind::_Synchronous`](/library/api/cuda::mr::_Synchronous) for [`any_synchronous_resource`](/library/api/cuda::mr::any_synchronous_resource), or [`_ResourceKind::_Asynchronous`](/library/api/cuda::mr::_Asynchronous) for [`any_resource`](/library/api/cuda::mr::any_resource).
+
+
+
+A pack of property types that a memory resource must provide in order to be storable in instances of this `basic_any_resource` type.
+
+
+
+
+
+---
+
+## Constructors
+
+### basic_any_resource
+
+
+
+
+noexcept
+
+Constructs a `basic_any_resource` with no value.
+
+
+```cpp showLineNumbers={false}
+cuda::mr::basic_any_resource<_Kind, _Properties>::basic_any_resource() noexcept
+```
+
+
+
+[`has_value()`](/library/api/cuda::mr::basic_any_resource::has_value()) is `false`
+
+
+
+
+
+noexcept
+
+Move constructs a `basic_any_resource`.
+
+
+```cpp showLineNumbers={false}
+cuda::mr::basic_any_resource<_Kind, _Properties>::basic_any_resource(
+ basic_any_resource &&__other
+) noexcept
+```
+
+
+
+[`has_value()`](/library/api/cuda::mr::basic_any_resource::has_value()) is `true` if `__other` had a value prior to the move, and `false` otherwise. `__other.has_value()` is `false`.
+
+
+
+
+
+Copy constructs a `basic_any_resource`.
+
+
+```cpp showLineNumbers={false}
+cuda::mr::basic_any_resource<_Kind, _Properties>::basic_any_resource(
+ const basic_any_resource &__other
+)
+```
+
+
+
+[`has_value()`](/library/api/cuda::mr::basic_any_resource::has_value()) is the same as `__other.has_value()`.
+
+
+
+
+
+Constructs a `basic_any_resource` from a type that satisfies the `resource` concept and that supports all of the specified properties.
+
+
+
+
+```cpp showLineNumbers={false}
+template
+cuda::mr::basic_any_resource<_Kind, _Properties>::basic_any_resource(
+ _Resource __res
+)
+```
+
+
+
+[`has_value()`](/library/api/cuda::mr::basic_any_resource::has_value()) is `true`
+
+
+
+`_Resource` is not a specialization of `basic_any_resource` or [`basic_resource_ref`](/library/api/cuda::mr::basic_resource_ref), or a type derived from such.
+
+
+
+[`synchronous_resource_with`](/library/api/cuda::mr::synchronous_resource_with)`<_Resource, _Properties...>` is `true`.
+
+
+
+If `_Kind` is [`_ResourceKind::_Asynchronous`](/library/api/cuda::mr::_Asynchronous), [`resource_with`](/library/api/cuda::mr::resource_with)`<_Resource, _Properties...>` is `true`.
+
+
+**Parameters**
+
+
+The resource to be wrapped by the `basic_any_resource`.
+
+
+
+
+
+Conversion from a type-erased resource with a superset of the required properties.
+
+
+```cpp showLineNumbers={false}
+template <_ResourceKind _OtherKind, class... _OtherProperties>
+cuda::mr::basic_any_resource<_Kind, _Properties>::basic_any_resource(
+ basic_any_resource<_OtherKind, _OtherProperties...> __res
+)
+```
+
+
+
+[`has_value()`](/library/api/cuda::mr::basic_any_resource::has_value()) is equal to `__res.has_value()`
+
+
+
+`_OtherKind` is equal to either `_Kind` or [`_ResourceKind::_Asynchronous`](/library/api/cuda::mr::_Asynchronous).
+
+
+
+The set `_OtherProperties...` is a superset of `_Properties...`.
+
+
+**Parameters**
+
+
+The object to copy from.
+
+
+
+
+
+Deep copy from a type-erased resource reference with a superset of the required properties.
+
+The object to which `__res` refers is copied into `*this`.
+
+
+```cpp showLineNumbers={false}
+template <_ResourceKind _OtherKind, class... _OtherProperties>
+cuda::mr::basic_any_resource<_Kind, _Properties>::basic_any_resource(
+ basic_resource_ref<_OtherKind, _OtherProperties...> __res
+)
+```
+
+
+
+`_OtherKind` is equal to either `_Kind` or [`_ResourceKind::_Asynchronous`](/library/api/cuda::mr::_Asynchronous).
+
+
+
+The set `_OtherProperties...` is a superset of `_Properties...`.
+
+
+**Parameters**
+
+
+The reference to copy from.
+
+
+
+
+
+---
+
+## Assignment operators
+
+### operator=
+
+
+
+
+noexcept
+
+Move assigns a `basic_any_resource`.
+
+
+```cpp showLineNumbers={false}
+basic_any_resource & cuda::mr::basic_any_resource<_Kind, _Properties>::operator=(
+ basic_any_resource &&__other
+) noexcept
+```
+
+
+
+[`has_value()`](/library/api/cuda::mr::basic_any_resource::has_value()) is `true` if `__other` had a value prior to the move, and `false` otherwise.
+
+
+
+`__other.has_value()` is `false`.
+
+
+
+
+
+Copy assigns a `basic_any_resource`.
+
+
+```cpp showLineNumbers={false}
+basic_any_resource & cuda::mr::basic_any_resource<_Kind, _Properties>::operator=(
+ const basic_any_resource &__other
+)
+```
+
+
+
+[`has_value()`](/library/api/cuda::mr::basic_any_resource::has_value()) is the same as `__other.has_value()`.
+
+
+
+
+
+Assigns from a type that satisfies the `resource` concept and that supports all of the specified properties.
+
+
+```cpp showLineNumbers={false}
+template
+basic_any_resource & cuda::mr::basic_any_resource<_Kind, _Properties>::operator=(
+ _Resource __res
+)
+```
+
+
+
+[`has_value()`](/library/api/cuda::mr::basic_any_resource::has_value()) is `true`
+
+
+
+`_Resource` is not a specialization of `basic_any_resource` or [`basic_resource_ref`](/library/api/cuda::mr::basic_resource_ref), or a type derived from such.
+
+
+
+[`synchronous_resource_with`](/library/api/cuda::mr::synchronous_resource_with)`<_Resource, _Properties...>` is `true`.
+
+
+
+If `_Kind` is [`_ResourceKind::_Asynchronous`](/library/api/cuda::mr::_Asynchronous), [`resource_with`](/library/api/cuda::mr::resource_with)`<_Resource, _Properties...>` is `true`.
+
+
+**Parameters**
+
+
+The resource to be wrapped within the `basic_any_resource`
+
+
+
+
+
+Assignment from a type-erased resource with a superset of the required properties.
+
+
+```cpp showLineNumbers={false}
+template <_ResourceKind _OtherKind, class... _OtherProperties>
+basic_any_resource & cuda::mr::basic_any_resource<_Kind, _Properties>::operator=(
+ basic_any_resource<_OtherKind, _OtherProperties...> __res
+)
+```
+
+
+
+[`has_value()`](/library/api/cuda::mr::basic_any_resource::has_value()) is equal to `__res.has_value()`.
+
+
+
+`_OtherKind` is equal to either `_Kind` or [`_ResourceKind::_Asynchronous`](/library/api/cuda::mr::_Asynchronous).
+
+
+
+The set `_OtherProperties...` is a superset of `_Properties...`.
+
+
+**Parameters**
+
+
+The object to copy from.
+
+
+
+
+
+Deep copy from a type-erased resource reference with a superset of the required properties.
+
+
+```cpp showLineNumbers={false}
+template <_ResourceKind _OtherKind, class... _OtherProperties>
+basic_any_resource & cuda::mr::basic_any_resource<_Kind, _Properties>::operator=(
+ basic_resource_ref<_OtherKind, _OtherProperties...> __res
+)
+```
+
+
+
+[`has_value()`](/library/api/cuda::mr::basic_any_resource::has_value()) is `true`.
+
+
+
+`_OtherKind` is equal to either `_Kind` or [`_ResourceKind::_Asynchronous`](/library/api/cuda::mr::_Asynchronous).
+
+
+
+The set `_OtherProperties...` is a superset of `_Properties...`.
+
+
+**Parameters**
+
+
+The type-erased resource reference to copy from.
+
+
+
+
+
+---
+
+## Methods
+
+### operator== const nodiscard
+
+
+
+
+Equality comparison between two type-erased memory resources.
+
+
+```cpp showLineNumbers={false}
+template <_ResourceKind _OtherKind, class... _OtherProperties>
+bool cuda::mr::basic_any_resource<_Kind, _Properties>::operator==(
+ const basic_any_resource<_OtherKind, _OtherProperties...> &__rhs
+) const
+```
+
+
+
+`_OtherKind` is equal to either `_Kind` or [`_ResourceKind::_Asynchronous`](/library/api/cuda::mr::_Asynchronous).
+
+
+
+The set `_Properties...` is equal to the set `_OtherProperties...`.
+
+
+**Returns:** `true` if both resources hold objects of the same type and those objects compare equal, and `false` otherwise.
+
+**Parameters**
+
+
+The type-erased resource to compare with `*this`.
+
+
+
+
+
+Equality comparison between `*this` and a type-erased resource reference.
+
+
+```cpp showLineNumbers={false}
+template <_ResourceKind _OtherKind, class... _OtherProperties>
+bool cuda::mr::basic_any_resource<_Kind, _Properties>::operator==(
+ const basic_resource_ref<_OtherKind, _OtherProperties...> &__rhs
+) const
+```
+
+
+
+`_OtherKind` is equal to either `_Kind` or [`_ResourceKind::_Asynchronous`](/library/api/cuda::mr::_Asynchronous).
+
+
+
+The set `_Properties...` is equal to the set `_OtherProperties...`.
+
+
+**Returns:** `true` if `__rhs` refers to an object of the same type as that wrapped by `*this` and those objects compare equal; `false` otherwise.
+
+**Parameters**
+
+
+The type-erased resource reference to compare with `*this`.
+
+
+
+
+
+### allocate_sync nodiscard
+
+Calls [`allocate_sync`](/library/api/cuda::mr::basic_any_resource::allocate_sync) on the wrapped object with the specified arguments.
+
+
+```cpp showLineNumbers={false}
+void * cuda::mr::basic_any_resource<_Kind, _Properties>::allocate_sync(
+ size_t __size,
+ size_t __align = alignof(cuda::std::max_align_t)
+)
+```
+
+
+
+[`has_value()`](/library/api/cuda::mr::basic_any_resource::has_value()) is `true`.
+
+
+**Returns:** `obj.allocate_sync(__size, __align)`, where `obj` is the wrapped object.
+
+### deallocate_sync
+
+Calls [`deallocate_sync`](/library/api/cuda::mr::basic_any_resource::deallocate_sync) on the wrapped object with the specified arguments.
+
+
+```cpp showLineNumbers={false}
+void cuda::mr::basic_any_resource<_Kind, _Properties>::deallocate_sync(
+ void *__pv,
+ size_t __size,
+ size_t __align = alignof(cuda::std::max_align_t)
+)
+```
+
+
+
+[`has_value()`](/library/api/cuda::mr::basic_any_resource::has_value()) is `true`.
+
+
+
+`__pv` must be a pointer that was previously returned by a call to `allocate_sync` on the object wrapped by `*this`.
+
+
+**Returns:** `obj.deallocate_sync(__pv, __size, __align)`, where `obj` is the wrapped object.
+
+### allocate nodiscard
+
+
+
+
+Calls [`allocate`](/library/api/cuda::mr::basic_any_resource::allocate) on the wrapped object with the specified arguments.
+
+
+```cpp showLineNumbers={false}
+void * cuda::mr::basic_any_resource<_Kind, _Properties>::allocate(
+ cuda::stream_ref __stream,
+ size_t __size,
+ size_t __align
+)
+```
+
+
+
+The returned pointer is not valid until `__stream` has been synchronized.
+
+
+
+`_Kind` is [`_ResourceKind::_Asynchronous`](/library/api/cuda::mr::_Asynchronous).
+
+
+
+[`has_value()`](/library/api/cuda::mr::basic_any_resource::has_value()) is `true`.
+
+
+**Returns:** `obj.allocate(__stream, __size, __align)`, where `obj` is the wrapped object.
+
+
+
+
+Equivalent to `allocate(__stream, __size,
+alignof(::cuda::std::max_align_t))`.
+
+
+```cpp showLineNumbers={false}
+void * cuda::mr::basic_any_resource<_Kind, _Properties>::allocate(
+ cuda::stream_ref __stream,
+ size_t __size
+)
+```
+
+
+
+
+
+### deallocate
+
+
+
+
+Calls [`deallocate`](/library/api/cuda::mr::basic_any_resource::deallocate) on the wrapped object with the specified arguments.
+
+
+```cpp showLineNumbers={false}
+void cuda::mr::basic_any_resource<_Kind, _Properties>::deallocate(
+ cuda::stream_ref __stream,
+ void *__pv,
+ size_t __size,
+ size_t __align
+)
+```
+
+
+
+`_Kind` is [`_ResourceKind::_Asynchronous`](/library/api/cuda::mr::_Asynchronous).
+
+
+
+[`has_value()`](/library/api/cuda::mr::basic_any_resource::has_value()) is `true`.
+
+
+
+`__pv` must be a pointer that was previously returned by a call to `allocate` on the object wrapped by `*this`.
+
+
+**Returns:** `obj.deallocate(__stream, __pv, __size, __align)`, where `obj` is the wrapped object.
+
+
+
+
+Equivalent to `deallocate(__stream, __pv, __size,
+alignof(::cuda::std::max_align_t))`.
+
+
+```cpp showLineNumbers={false}
+void cuda::mr::basic_any_resource<_Kind, _Properties>::deallocate(
+ cuda::stream_ref __stream,
+ void *__pv,
+ size_t __size
+)
+```
+
+
+
+
+
+### has_value const noexcept nodiscard
+
+Checks if `*this` holds a value.
+
+
+```cpp showLineNumbers={false}
+bool cuda::mr::basic_any_resource<_Kind, _Properties>::has_value() const noexcept
+```
+
+
+**Returns:** `true` if `*this` holds a value; `false` otherwise.
+
+### reset noexcept
+
+Resets `*this` to the empty state.
+
+
+```cpp showLineNumbers={false}
+void cuda::mr::basic_any_resource<_Kind, _Properties>::reset() noexcept
+```
+
+
+
+[`has_value()`](/library/api/cuda::mr::basic_any_resource::has_value())` == false`
+
+
+### type const noexcept nodiscard
+
+
+```cpp showLineNumbers={false}
+const cuda::std::type_info & cuda::mr::basic_any_resource<_Kind, _Properties>::type() const noexcept
+```
+
+
+**Returns:** A reference to the `type_info` object for the wrapped resource, or `typeid(void)` if [`has_value()`](/library/api/cuda::mr::basic_any_resource::has_value()) is `false`.
diff --git a/fern/cudapages/cuda/cuda/cuda/mr/basic_resource_ref.mdx b/fern/cudapages/cuda/cuda/cuda/mr/basic_resource_ref.mdx
new file mode 100644
index 0000000..757cb31
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/mr/basic_resource_ref.mdx
@@ -0,0 +1,375 @@
+---
+title: "cuda::mr::basic_resource_ref"
+description: "Type erased wrapper around a reference to an object that satisfies the `resource` concept and that provides the requested `_Properties`."
+---
+
+Type erased wrapper around a reference to an object that satisfies the `resource` concept and that provides the requested `_Properties`.
+
+`basic_resource_ref` models the `cuda::std::copyable` and `cuda::std::equality_comparable` concepts.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+
+
+
+
+
+
+
+The properties that any resource wrapped within the `basic_resource_ref` needs to provide.
+
+
+
+
+
+---
+
+## Constructors
+
+### basic_resource_ref
+
+
+
+
+Copy constructs a `basic_resource_ref`.
+
+
+```cpp showLineNumbers={false}
+cuda::mr::basic_resource_ref<_Kind, _Properties>::basic_resource_ref(
+ const basic_resource_ref &__other
+)
+```
+
+
+
+`*this` and `__other` both refer to the same resource object.
+
+
+
+
+
+Constructs a `basic_resource_ref` from a reference to a type that satisfies the `resource` concept and that supports all of the specified properties.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::mr::basic_resource_ref<_Kind, _Properties>::basic_resource_ref(
+ _Resource &__res
+)
+```
+
+
+
+[`synchronous_resource_with`](/library/api/cuda::mr::synchronous_resource_with)`<_Resource, _Properties...>` is `true`.
+
+
+
+If `_Kind` is [`_ResourceKind::_Asynchronous`](/library/api/cuda::mr::_Asynchronous), [`resource_with`](/library/api/cuda::mr::resource_with)`<_Resource, _Properties...>` is `true`.
+
+
+
+If `__res` refers to a specialization of [`basic_any_resource`](/library/api/cuda::mr::basic_any_resource) or a type derived from such, `__res.has_value()` is `true`.
+
+
+**Parameters**
+
+
+The resource reference to be wrapped.
+
+
+
+
+
+Conversion from type-erased resource reference with a superset of the required properties.
+
+
+```cpp showLineNumbers={false}
+template <_ResourceKind _OtherKind, class... _OtherProperties>
+cuda::mr::basic_resource_ref<_Kind, _Properties>::basic_resource_ref(
+ basic_resource_ref<_OtherKind, _OtherProperties...> __res
+)
+```
+
+
+
+`*this` and `__res` both refer to the same resource object.
+
+
+
+`_OtherKind` is equal to either `_Kind` or [`_ResourceKind::_Asynchronous`](/library/api/cuda::mr::_Asynchronous).
+
+
+
+The set `_OtherProperties...` is a superset of `_Properties...`.
+
+
+**Parameters**
+
+
+The other type-erased resource reference to copy from.
+
+
+
+
+
+---
+
+## Assignment operators
+
+### operator=
+
+
+
+
+Rebinds `*this` to refer to the object to which `__other` refers.
+
+
+```cpp showLineNumbers={false}
+basic_resource_ref & cuda::mr::basic_resource_ref<_Kind, _Properties>::operator=(
+ const basic_resource_ref &__other
+)
+```
+
+
+
+`*this` and `__other` both refer to the same resource object.
+
+
+
+
+
+Rebinds the wrapped reference to an object whose type satisfies the `resource` concept and that supports all of the specified properties.
+
+
+```cpp showLineNumbers={false}
+template
+basic_resource_ref & cuda::mr::basic_resource_ref<_Kind, _Properties>::operator=(
+ _Resource &__res
+)
+```
+
+
+
+[`synchronous_resource_with`](/library/api/cuda::mr::synchronous_resource_with)`<_Resource, _Properties...>` is `true`.
+
+
+
+If `_Kind` is [`_ResourceKind::_Asynchronous`](/library/api/cuda::mr::_Asynchronous), [`resource_with`](/library/api/cuda::mr::resource_with)`<_Resource, _Properties...>` is `true`.
+
+
+
+If `__res` refers to a specialization of [`basic_any_resource`](/library/api/cuda::mr::basic_any_resource) or a type derived from such, `__res.has_value()` is `true`.
+
+
+**Parameters**
+
+
+The reference to the resource to be wrapped by the `basic_resource_ref`.
+
+
+
+
+
+Rebinds `*this` to refer to the object to which `__other` refers.
+
+
+```cpp showLineNumbers={false}
+template <_ResourceKind _OtherKind, class... _OtherProperties>
+basic_resource_ref & cuda::mr::basic_resource_ref<_Kind, _Properties>::operator=(
+ basic_resource_ref<_OtherKind, _OtherProperties...> __res
+)
+```
+
+
+
+`*this` and `__res` both refer to the same resource object.
+
+
+
+`_OtherKind` is equal to either `_Kind` or [`_ResourceKind::_Asynchronous`](/library/api/cuda::mr::_Asynchronous).
+
+
+
+The set `_OtherProperties...` is a superset of `_Properties...`.
+
+
+**Parameters**
+
+
+The other type-erased resource reference to copy from.
+
+
+
+
+
+---
+
+## Methods
+
+### operator== const nodiscard
+
+Equality comparison between two type-erased resource references.
+
+
+```cpp showLineNumbers={false}
+template <_ResourceKind _OtherKind, class... _OtherProperties>
+bool cuda::mr::basic_resource_ref<_Kind, _Properties>::operator==(
+ const basic_resource_ref<_OtherKind, _OtherProperties...> &__rhs
+) const
+```
+
+
+
+`_OtherKind` is equal to either `_Kind` or [`_ResourceKind::_Asynchronous`](/library/api/cuda::mr::_Asynchronous).
+
+
+
+The set `_Properties...` is equal to the set `_OtherProperties...`.
+
+
+**Returns:** `true` if both resources refer to objects of the same type and those objects compare equal. Otherwise, returns `false`.
+
+**Parameters**
+
+
+The other type-erased resource reference.
+
+
+### allocate_sync nodiscard
+
+Calls [`allocate_sync`](/library/api/cuda::mr::basic_resource_ref::allocate_sync) on the wrapped reference with the specified arguments.
+
+
+```cpp showLineNumbers={false}
+void * cuda::mr::basic_resource_ref<_Kind, _Properties>::allocate_sync(
+ size_t __size,
+ size_t __align = alignof(cuda::std::max_align_t)
+)
+```
+
+
+**Returns:** `obj.allocate_sync(__size, __align)`, where `obj` is the wrapped reference.
+
+### deallocate_sync
+
+Calls [`deallocate_sync`](/library/api/cuda::mr::basic_resource_ref::deallocate_sync) on the wrapped reference with the specified arguments.
+
+
+```cpp showLineNumbers={false}
+void cuda::mr::basic_resource_ref<_Kind, _Properties>::deallocate_sync(
+ void *__pv,
+ size_t __size,
+ size_t __align = alignof(cuda::std::max_align_t)
+)
+```
+
+
+
+`__pv` must be a pointer that was previously returned by a call to `allocate` on the object referenced by `*this`.
+
+
+**Returns:** `obj.deallocate_sync(__pv, __size, __align)`, where `obj` is the wrapped reference.
+
+### allocate nodiscard
+
+
+
+
+Calls [`allocate`](/library/api/cuda::mr::basic_resource_ref::allocate) on the wrapped reference with the specified arguments.
+
+
+```cpp showLineNumbers={false}
+void * cuda::mr::basic_resource_ref<_Kind, _Properties>::allocate(
+ cuda::stream_ref __stream,
+ size_t __size,
+ size_t __align
+)
+```
+
+
+
+The returned pointer is not valid until `__stream` has been synchronized.
+
+
+
+`_Kind` is [`_ResourceKind::_Asynchronous`](/library/api/cuda::mr::_Asynchronous).
+
+
+**Returns:** `obj.allocate(__stream, __size, __align)`, where `obj` is the wrapped reference.
+
+
+
+
+Equivalent to `allocate(__stream, __size,
+alignof(::cuda::std::max_align_t))`.
+
+
+```cpp showLineNumbers={false}
+void * cuda::mr::basic_resource_ref<_Kind, _Properties>::allocate(
+ cuda::stream_ref __stream,
+ size_t __size
+)
+```
+
+
+
+
+
+### deallocate
+
+
+
+
+Calls [`deallocate`](/library/api/cuda::mr::basic_resource_ref::deallocate) on the wrapped reference with the specified arguments.
+
+
+```cpp showLineNumbers={false}
+void cuda::mr::basic_resource_ref<_Kind, _Properties>::deallocate(
+ cuda::stream_ref __stream,
+ void *__pv,
+ size_t __size,
+ size_t __align
+)
+```
+
+
+
+`_Kind` is [`_ResourceKind::_Asynchronous`](/library/api/cuda::mr::_Asynchronous).
+
+
+
+`__pv` must be a pointer that was previously returned by a call to `allocate` on the object referenced by `*this`.
+
+
+**Returns:** `obj.deallocate(__stream, __pv, __size, __align)`, where `obj` is the wrapped reference.
+
+
+
+
+Equivalent to `deallocate(__stream, __pv, __size,
+alignof(::cuda::std::max_align_t))`.
+
+
+```cpp showLineNumbers={false}
+void cuda::mr::basic_resource_ref<_Kind, _Properties>::deallocate(
+ cuda::stream_ref __stream,
+ void *__pv,
+ size_t __size
+)
+```
+
+
+
+
+
+### type const noexcept nodiscard
+
+
+```cpp showLineNumbers={false}
+const cuda::std::type_info & cuda::mr::basic_resource_ref<_Kind, _Properties>::type() const noexcept
+```
+
+
+**Returns:** A reference to the `type_info` object for the type of the object to which `*this` refers.
diff --git a/fern/cudapages/cuda/cuda/cuda/mr/device_accessible.mdx b/fern/cudapages/cuda/cuda/cuda/mr/device_accessible.mdx
new file mode 100644
index 0000000..c7aade1
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/mr/device_accessible.mdx
@@ -0,0 +1,10 @@
+---
+title: "cuda::mr::device_accessible"
+description: "The [device_accessible](/library/api/cuda::mr::device_accessible) property signals that the allocated memory is device accessible."
+---
+
+The `device_accessible` property signals that the allocated memory is device accessible.
+
+```cpp showLineNumbers={false}
+#include
+```
diff --git a/fern/cudapages/cuda/cuda/cuda/mr/host_accessible.mdx b/fern/cudapages/cuda/cuda/cuda/mr/host_accessible.mdx
new file mode 100644
index 0000000..733e9f2
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/mr/host_accessible.mdx
@@ -0,0 +1,10 @@
+---
+title: "cuda::mr::host_accessible"
+description: "The [host_accessible](/library/api/cuda::mr::host_accessible) property signals that the allocated memory is host accessible."
+---
+
+The `host_accessible` property signals that the allocated memory is host accessible.
+
+```cpp showLineNumbers={false}
+#include
+```
diff --git a/fern/cudapages/cuda/cuda/cuda/mr/legacy_managed_memory_resource.mdx b/fern/cudapages/cuda/cuda/cuda/mr/legacy_managed_memory_resource.mdx
new file mode 100644
index 0000000..b5a4792
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/mr/legacy_managed_memory_resource.mdx
@@ -0,0 +1,120 @@
+---
+title: "cuda::mr::legacy_managed_memory_resource"
+description: "`legacy_managed_memory_resource` uses `cudaMallocManaged` / `cudaFree` for allocation / deallocation."
+---
+
+`legacy_managed_memory_resource` uses `cudaMallocManaged` / `cudaFree` for allocation / deallocation.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+---
+
+## Constructors
+
+### legacy_managed_memory_resource inline constexpr noexcept
+
+Construct a new `legacy_managed_memory_resource`.
+
+
+```cpp showLineNumbers={false}
+cuda::mr::legacy_managed_memory_resource::legacy_managed_memory_resource(
+ const unsigned int __flags = cudaMemAttachGlobal,
+ device_ref __device = {0}
+) noexcept
+```
+
+
+
+Synchronous allocations in CUDA are tied to a device, even if not located in device memory. This constructor takes an optional device argument to specify the device that should be tied to allocations for the resource. This association has the effect of initializing that device and the memory being implicitly freed if the device is reset.
+
+
+---
+
+## Methods
+
+### allocate_sync inline nodiscard
+
+Allocate CUDA unified memory of size at least `__bytes`.
+
+
+```cpp showLineNumbers={false}
+void * cuda::mr::legacy_managed_memory_resource::allocate_sync(
+ const size_t __bytes,
+ const size_t __alignment = ::cuda::mr::default_cuda_malloc_alignment
+)
+```
+
+
+**Returns:** Pointer to the newly allocated memory
+
+**Throws:** `std::invalid_argument` in case of invalid alignment or `cuda::cuda_error` of the returned error code.
+
+**Parameters**
+
+
+The size in bytes of the allocation.
+
+
+
+The requested alignment of the allocation.
+
+
+### deallocate_sync inline noexcept
+
+Deallocate memory pointed to by `__ptr`.
+
+
+```cpp showLineNumbers={false}
+void cuda::mr::legacy_managed_memory_resource::deallocate_sync(
+ void *__ptr,
+ const size_t __bytes,
+ const size_t __alignment = ::cuda::mr::default_cuda_malloc_alignment
+) noexcept
+```
+
+
+**Parameters**
+
+
+Pointer to be deallocated. Must have been allocated through a call to `allocate` or [`allocate_sync`](/library/api/cuda::mr::legacy_managed_memory_resource::allocate_sync)
+
+
+
+The number of bytes that was passed to the allocation call that returned `__ptr`.
+
+
+
+The alignment that was passed to the allocation call that returned `__ptr`.
+
+
+### operator== inline constexpr const noexcept nodiscard
+
+Equality comparison with another `legacy_managed_memory_resource`.
+
+
+```cpp showLineNumbers={false}
+bool cuda::mr::legacy_managed_memory_resource::operator==(
+ legacy_managed_memory_resource const &__other
+) const noexcept
+```
+
+
+**Returns:** Whether both `legacy_managed_memory_resource` objects were constructed with the same flags.
+
+**Parameters**
+
+
+The other `legacy_managed_memory_resource`.
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `default_queries` | `::cuda::mr::properties_list<::cuda::mr::device_accessible, ::cuda::mr::host_accessible >` |
diff --git a/fern/cudapages/cuda/cuda/cuda/mr/legacy_pinned_memory_resource.mdx b/fern/cudapages/cuda/cuda/cuda/mr/legacy_pinned_memory_resource.mdx
new file mode 100644
index 0000000..8c01ace
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/mr/legacy_pinned_memory_resource.mdx
@@ -0,0 +1,117 @@
+---
+title: "cuda::mr::legacy_pinned_memory_resource"
+description: "[legacy_pinned_memory_resource](/library/api/cuda::mr::legacy_pinned_memory_resource) uses `cudaMallocHost` / `cudaFreeHost` for allocation / deallocation."
+---
+
+`legacy_pinned_memory_resource` uses `cudaMallocHost` / `cudaFreeHost` for allocation / deallocation.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+
+This memory resource will be deprecated in the future. For CUDA 12.6 and above, use `cuda::pinned_memory_resource` instead, which is the long-term replacement.
+
+
+---
+
+## Constructors
+
+### legacy_pinned_memory_resource inline constexpr noexcept
+
+Construct a new `legacy_pinned_memory_resource`.
+
+
+```cpp showLineNumbers={false}
+cuda::mr::legacy_pinned_memory_resource::legacy_pinned_memory_resource(
+ ::cuda::device_ref __device = {0}
+) noexcept
+```
+
+
+
+Synchronous allocations in CUDA are tied to a device, even if not located in device memory. This constructor takes an optional device argument to specify the device that should be tied to allocations for the resource. This association has the effect of initializing that device and the memory being implicitly freed if the device is reset.
+
+
+---
+
+## Methods
+
+### allocate_sync inline nodiscard
+
+Allocate host memory of size at least `__bytes`.
+
+
+```cpp showLineNumbers={false}
+void * cuda::mr::legacy_pinned_memory_resource::allocate_sync(
+ const size_t __bytes,
+ const size_t __alignment = ::cuda::mr::default_cuda_malloc_alignment
+)
+```
+
+
+**Returns:** Pointer to the newly allocated memory
+
+**Throws:** `std::invalid_argument` in case of invalid alignment or `cuda::cuda_error` of the returned error code.
+
+**Parameters**
+
+
+The size in bytes of the allocation.
+
+
+
+The requested alignment of the allocation.
+
+
+### deallocate_sync inline noexcept
+
+Deallocate memory pointed to by `__ptr`.
+
+
+```cpp showLineNumbers={false}
+void cuda::mr::legacy_pinned_memory_resource::deallocate_sync(
+ void *__ptr,
+ const size_t __bytes,
+ const size_t __alignment = ::cuda::mr::default_cuda_malloc_alignment
+) noexcept
+```
+
+
+**Parameters**
+
+
+Pointer to be deallocated. Must have been allocated through a call to [`allocate_sync`](/library/api/cuda::mr::legacy_pinned_memory_resource::allocate_sync).
+
+
+
+The number of bytes that was passed to the allocation call that returned `__ptr`.
+
+
+
+The alignment that was passed to the allocation call that returned `__ptr`.
+
+
+### operator== inline constexpr const noexcept nodiscard
+
+Equality comparison with another `legacy_pinned_memory_resource`.
+
+
+```cpp showLineNumbers={false}
+bool cuda::mr::legacy_pinned_memory_resource::operator==(
+ legacy_pinned_memory_resource const &
+) const noexcept
+```
+
+
+**Returns:** Whether both `legacy_pinned_memory_resource` were constructed with the same flags.
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `default_queries` | `::cuda::mr::properties_list<::cuda::mr::device_accessible, ::cuda::mr::host_accessible >` |
diff --git a/fern/cudapages/cuda/cuda/cuda/mr/properties_list.mdx b/fern/cudapages/cuda/cuda/cuda/mr/properties_list.mdx
new file mode 100644
index 0000000..5c4bf6e
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/mr/properties_list.mdx
@@ -0,0 +1,45 @@
+---
+title: "cuda::mr::properties_list"
+description: "A type representing a list of memory resource properties."
+---
+
+A type representing a list of memory resource properties.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+
+
+
+
+The properties to be included in the list. It has a member template [`rebind`](/library/api/cuda::mr::properties_list::rebind) that allows constructing a type by combining a template and type arguments with the properties from this list. The properties are appended after the type arguments in the resulting type.
+
+
+
+
+
+---
+
+## Static methods
+
+### has_property inline static constexpr
+
+
+```cpp showLineNumbers={false}
+template
+static constexpr bool cuda::mr::properties_list<_Properties>::has_property(
+ _QueryProperty
+)
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition | Description |
+|---|---|---|
+| `rebind` | `_Fn< _ExtraArgs..., _Properties... >` | A type alias for a type template instantiated with the properties from this list appended to the type arguments. |
diff --git a/fern/cudapages/cuda/cuda/cuda/mr/resource.mdx b/fern/cudapages/cuda/cuda/cuda/mr/resource.mdx
new file mode 100644
index 0000000..322624d
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/mr/resource.mdx
@@ -0,0 +1,39 @@
+---
+title: "cuda::mr::resource"
+description: "The `resource` concept verifies that a type Resource satisfies the basic requirements of a memory resource and additionally supports stream ordered allocations."
+---
+
+C++20 concept
+
+The `resource` concept verifies that a type Resource satisfies the basic requirements of a memory resource and additionally supports stream ordered allocations.
+
+
+```cpp showLineNumbers={false}
+template
+concept resource = /* see description */;
+```
+
+
+
+
+
+
+The type that should implement the resource concept
+
+
+
+
+
+---
+
+## Description
+
+We require that a resource supports the following interface
+
+ - `allocate_sync(size_t bytes, size_t alignment)`
+ - `deallocate_sync(void* ptr, size_t bytes, size_t alignment)`
+ - `T() == T()`
+ - `T() != T()`
+
+ - `allocate(cuda::stream_ref stream, size_t bytes, size_t alignment)`
+ - `deallocate( cuda::stream_ref stream, void* ptr, size_t bytes, size_t alignment)`
diff --git a/fern/cudapages/cuda/cuda/cuda/mr/resource_with.mdx b/fern/cudapages/cuda/cuda/cuda/mr/resource_with.mdx
new file mode 100644
index 0000000..f6d8218
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/mr/resource_with.mdx
@@ -0,0 +1,27 @@
+---
+title: "cuda::mr::resource_with"
+description: "The `resource_with` concept verifies that a type Resource satisfies the [`resource`](/library/api/cuda::mr::resource) concept and also satisfies all the provided Properties."
+---
+
+C++20 concept
+
+The `resource_with` concept verifies that a type Resource satisfies the [`resource`](/library/api/cuda::mr::resource) concept and also satisfies all the provided Properties.
+
+
+```cpp showLineNumbers={false}
+template
+concept resource_with = /* see description */;
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/fern/cudapages/cuda/cuda/cuda/mr/shared_resource.mdx b/fern/cudapages/cuda/cuda/cuda/mr/shared_resource.mdx
new file mode 100644
index 0000000..cd7c091
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/mr/shared_resource.mdx
@@ -0,0 +1,435 @@
+---
+title: "cuda::mr::shared_resource"
+description: ""
+---
+
+`shared_resource` holds a reference counted instance of a memory resource. This allows the user to pass a resource around with reference semantics while avoiding lifetime issues.
+
+**Note:** `shared_resource` satisfies the `cuda::mr::resource` concept if and only if `_Resource` satisfies it.
+
+**Template parameter** `_Resource`: The resource type to hold.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+
+
+
+
+
+
+
+
+
+**Inherits from:** `cuda::mr::__copy_default_queries< _Resource >` (public)
+
+---
+
+## Constructors
+
+### shared_resource inline
+
+
+
+
+explicit
+
+Constructs a `shared_resource` referring to an object of type `_Resource` that has been constructed with arguments `__args`.
+
+The `_Resource` object is dynamically allocated with `new`.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::mr::shared_resource<_Resource>::shared_resource(
+ ::cuda::std::in_place_type_t<_Resource>,
+ _Args &&... __args
+)
+```
+
+
+**Parameters**
+
+
+The arguments to be passed to the `_Resource` constructor.
+
+
+
+
+
+noexcept
+
+Copy-constructs a `shared_resource` object resulting in a copy that shares ownership of the wrapped resource with `__other`.
+
+
+```cpp showLineNumbers={false}
+cuda::mr::shared_resource<_Resource>::shared_resource(
+ const shared_resource &__other
+) noexcept
+```
+
+
+**Parameters**
+
+
+The `shared_resource` object to copy from.
+
+
+
+
+
+noexcept
+
+Move-constructs a `shared_resource` assuming ownership of the resource stored in `__other`.
+
+
+```cpp showLineNumbers={false}
+cuda::mr::shared_resource<_Resource>::shared_resource(
+ shared_resource &&__other
+) noexcept
+```
+
+
+
+`__other` is left in a valid but unspecified state.
+
+
+**Parameters**
+
+
+The `shared_resource` object to move from.
+
+
+
+
+
+### Destructor
+
+### ~shared_resource inline
+
+Releases the reference held by this `shared_resource` object.
+
+If this is the last reference to the wrapped resource, the resource is deleted.
+
+
+```cpp showLineNumbers={false}
+cuda::mr::shared_resource<_Resource>::~shared_resource()
+```
+
+
+---
+
+## Assignment operators
+
+### operator= inline noexcept
+
+
+
+
+Copy-assigns from `__other`.
+
+Self-assignment is a no-op. Otherwise, the reference held by this `shared_resource` object is released and a new reference is acquired to the wrapped resource of `__other`, if any.
+
+
+```cpp showLineNumbers={false}
+shared_resource & cuda::mr::shared_resource<_Resource>::operator=(
+ const shared_resource &__other
+) noexcept
+```
+
+
+**Parameters**
+
+
+The `shared_resource` object to copy from.
+
+
+
+
+
+Move-assigns from `__other`.
+
+Self-assignment is a no-op. Otherwise, the reference held by this `shared_resource` object is released, while the reference held by `__other` is transferred to this object.
+
+
+```cpp showLineNumbers={false}
+shared_resource & cuda::mr::shared_resource<_Resource>::operator=(
+ shared_resource &&__other
+) noexcept
+```
+
+
+
+`__other` is left in a valid but unspecified state.
+
+
+**Parameters**
+
+
+The `shared_resource` object to move from.
+
+
+
+
+
+---
+
+## Methods
+
+### swap inline noexcept
+
+Swaps a `shared_resource` with another one.
+
+
+```cpp showLineNumbers={false}
+void cuda::mr::shared_resource<_Resource>::swap(
+ shared_resource &__other
+) noexcept
+```
+
+
+**Parameters**
+
+
+The other `shared_resource`.
+
+
+### get inline noexcept nodiscard
+
+
+
+
+Returns a reference to the stored resource.
+
+
+```cpp showLineNumbers={false}
+_Resource & cuda::mr::shared_resource<_Resource>::get() noexcept
+```
+
+
+**Returns:** A reference to the stored resource.
+
+
+
+
+const
+
+Returns a const reference to the stored resource.
+
+
+```cpp showLineNumbers={false}
+const _Resource & cuda::mr::shared_resource<_Resource>::get() const noexcept
+```
+
+
+**Returns:** A const reference to the stored resource.
+
+
+
+
+### operator-> inline noexcept nodiscard
+
+
+
+
+Returns a pointer to the stored resource.
+
+
+```cpp showLineNumbers={false}
+_Resource * cuda::mr::shared_resource<_Resource>::operator->() noexcept
+```
+
+
+**Returns:** A pointer to the stored resource.
+
+
+
+
+const
+
+Returns a const pointer to the stored resource.
+
+
+```cpp showLineNumbers={false}
+const _Resource * cuda::mr::shared_resource<_Resource>::operator->() const noexcept
+```
+
+
+**Returns:** A const pointer to the stored resource.
+
+
+
+
+### operator* inline noexcept nodiscard
+
+
+
+
+Returns a reference to the stored resource.
+
+
+```cpp showLineNumbers={false}
+_Resource & cuda::mr::shared_resource<_Resource>::operator*() noexcept
+```
+
+
+**Returns:** A reference to the stored resource.
+
+
+
+
+const
+
+Returns a const reference to the stored resource.
+
+
+```cpp showLineNumbers={false}
+const _Resource & cuda::mr::shared_resource<_Resource>::operator*() const noexcept
+```
+
+
+**Returns:** A const reference to the stored resource.
+
+
+
+
+### allocate_sync inline nodiscard
+
+Allocate memory of size at least `__bytes` using the stored resource.
+
+
+```cpp showLineNumbers={false}
+void * cuda::mr::shared_resource<_Resource>::allocate_sync(
+ size_t __bytes,
+ size_t __alignment = alignof(::cuda::std::max_align_t)
+)
+```
+
+
+**Returns:** Pointer to the newly allocated memory
+
+**Parameters**
+
+
+The size in bytes of the allocation.
+
+
+
+The requested alignment of the allocation.
+
+
+### deallocate_sync inline noexcept
+
+Deallocate memory pointed to by `__ptr` using the stored resource.
+
+
+```cpp showLineNumbers={false}
+void cuda::mr::shared_resource<_Resource>::deallocate_sync(
+ void *__ptr,
+ size_t __bytes,
+ size_t __alignment = alignof(::cuda::std::max_align_t)
+) noexcept
+```
+
+
+**Parameters**
+
+
+Pointer to be deallocated. Must have been allocated through a call to [`allocate`](/library/api/cuda::mr::shared_resource::allocate) or [`allocate_sync`](/library/api/cuda::mr::shared_resource::allocate_sync)
+
+
+
+The number of bytes that was passed to the allocation call that returned `__ptr`.
+
+
+
+The alignment that was passed to the allocation call that returned `__ptr`.
+
+
+### allocate inline nodiscard
+
+Enqueues an allocation of memory of size at least `__bytes` using the wrapped resource.
+
+The allocation is performed asynchronously on stream `__stream`.
+
+
+```cpp showLineNumbers={false}
+template
+void * cuda::mr::shared_resource<_Resource>::allocate(
+ ::cuda::stream_ref __stream,
+ size_t __bytes,
+ size_t __alignment
+)
+```
+
+
+
+The caller is responsible for ensuring that the memory is not accessed until the operation has completed.
+
+
+
+`_Resource` must satisfy `resource`.
+
+
+**Returns:** Pointer to the newly allocated memory.
+
+**Parameters**
+
+
+The size in bytes of the allocation.
+
+
+
+The requested alignment of the allocation.
+
+
+### deallocate inline noexcept
+
+Enqueues the deallocation of memory pointed to by `__ptr`.
+
+The deallocation is performed asynchronously on stream `__stream`.
+
+
+```cpp showLineNumbers={false}
+template
+void cuda::mr::shared_resource<_Resource>::deallocate(
+ ::cuda::stream_ref __stream,
+ void *__ptr,
+ size_t __bytes,
+ size_t __alignment
+) noexcept
+```
+
+
+
+The caller is responsible for ensuring that the memory is not accessed after the operation has completed.
+
+
+
+`_Resource` must satisfy `resource`.
+
+
+**Parameters**
+
+
+Pointer to be deallocated. Must have been allocated through a call to [`allocate`](/library/api/cuda::mr::shared_resource::allocate) or [`allocate_sync`](/library/api/cuda::mr::shared_resource::allocate_sync)
+
+
+
+The number of bytes that was passed to the allocation call that returned `__ptr`.
+
+
+
+The alignment that was passed to the allocation call that returned `__ptr`.
+
+
+---
+
+## Inner classes
+
+### _Control_block
+
+
+```cpp showLineNumbers={false}
+struct cuda::mr::shared_resource::_Control_block
+```
+
diff --git a/fern/cudapages/cuda/cuda/cuda/mr/synchronous_resource.mdx b/fern/cudapages/cuda/cuda/cuda/mr/synchronous_resource.mdx
new file mode 100644
index 0000000..3743179
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/mr/synchronous_resource.mdx
@@ -0,0 +1,36 @@
+---
+title: "cuda::mr::synchronous_resource"
+description: "The `synchronous_resource` concept verifies that a type Resource satisfies the basic requirements of a memory resource."
+---
+
+C++20 concept
+
+The `synchronous_resource` concept verifies that a type Resource satisfies the basic requirements of a memory resource.
+
+
+```cpp showLineNumbers={false}
+template
+concept synchronous_resource = /* see description */;
+```
+
+
+
+
+
+
+The type that should implement the synchronous resource concept
+
+
+
+
+
+---
+
+## Description
+
+We require that a resource supports the following interface
+
+ - `allocate_sync(size_t bytes, size_t alignment)`
+ - `deallocate_sync(void* ptr, size_t bytes, size_t alignment)`
+ - `T() == T()`
+ - `T() != T()`
diff --git a/fern/cudapages/cuda/cuda/cuda/mr/synchronous_resource_adapter.mdx b/fern/cudapages/cuda/cuda/cuda/mr/synchronous_resource_adapter.mdx
new file mode 100644
index 0000000..e34ea13
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/mr/synchronous_resource_adapter.mdx
@@ -0,0 +1,146 @@
+---
+title: "cuda::mr::synchronous_resource_adapter"
+description: "Adapter that allows a synchronous resource to be used as a resource. It examines the resource for the presence of the allocate and deallocate members."
+---
+
+Adapter that allows a synchronous resource to be used as a resource. It examines the resource for the presence of the allocate and deallocate members.
+
+If they are present, it passes through the allocate and deallocate calls to the contained resource. Otherwise, it uses the allocate_sync and deallocate_sync members (with proper synchronization in case of deallocate).
+
+```cpp showLineNumbers={false}
+#include
+```
+
+
+This adapter takes ownership of the contained resource.
+
+
+
+
+
+
+The type of the resource to be adapted
+
+
+
+
+
+**Inherits from:** `cuda::mr::__copy_default_queries< _Resource >` (public), `cuda::forward_property< synchronous_resource_adapter< _Resource >, _Resource >` (public)
+
+---
+
+## Constructors
+
+### synchronous_resource_adapter inline noexcept
+
+
+
+
+
+```cpp showLineNumbers={false}
+cuda::mr::synchronous_resource_adapter<_Resource>::synchronous_resource_adapter(
+ const _Resource &__resource
+) noexcept
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+cuda::mr::synchronous_resource_adapter<_Resource>::synchronous_resource_adapter(
+ _Resource &&__resource
+) noexcept
+```
+
+
+
+
+
+---
+
+## Methods
+
+### allocate inline nodiscard
+
+
+```cpp showLineNumbers={false}
+void * cuda::mr::synchronous_resource_adapter<_Resource>::allocate(
+ const ::cuda::stream_ref __stream,
+ const size_t __bytes,
+ const size_t __alignment
+)
+```
+
+
+### allocate_sync inline nodiscard
+
+
+```cpp showLineNumbers={false}
+void * cuda::mr::synchronous_resource_adapter<_Resource>::allocate_sync(
+ const size_t __bytes,
+ const size_t __alignment
+)
+```
+
+
+### deallocate inline noexcept
+
+
+```cpp showLineNumbers={false}
+void cuda::mr::synchronous_resource_adapter<_Resource>::deallocate(
+ const ::cuda::stream_ref __stream,
+ void *__ptr,
+ const size_t __bytes,
+ const size_t __alignment
+) noexcept
+```
+
+
+### deallocate_sync inline noexcept
+
+
+```cpp showLineNumbers={false}
+void cuda::mr::synchronous_resource_adapter<_Resource>::deallocate_sync(
+ void *__ptr,
+ const size_t __bytes,
+ const size_t __alignment
+) noexcept
+```
+
+
+### operator== inline const noexcept nodiscard
+
+
+```cpp showLineNumbers={false}
+bool cuda::mr::synchronous_resource_adapter<_Resource>::operator==(
+ const synchronous_resource_adapter &__rhs
+) const noexcept
+```
+
+
+### upstream_resource inline noexcept
+
+
+
+
+
+```cpp showLineNumbers={false}
+_Resource & cuda::mr::synchronous_resource_adapter<_Resource>::upstream_resource() noexcept
+```
+
+
+
+
+
+const
+
+
+```cpp showLineNumbers={false}
+const _Resource & cuda::mr::synchronous_resource_adapter<_Resource>::upstream_resource() const noexcept
+```
+
+
+
+
diff --git a/fern/cudapages/cuda/cuda/cuda/mr/synchronous_resource_with.mdx b/fern/cudapages/cuda/cuda/cuda/mr/synchronous_resource_with.mdx
new file mode 100644
index 0000000..bbc20c6
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/mr/synchronous_resource_with.mdx
@@ -0,0 +1,27 @@
+---
+title: "cuda::mr::synchronous_resource_with"
+description: "The `synchronous_resource_with` concept verifies that a type Resource satisfies the [`synchronous_resource`](/library/api/cuda::mr::synchronous_resource) concept and also satisfies all the provided Properties."
+---
+
+C++20 concept
+
+The `synchronous_resource_with` concept verifies that a type Resource satisfies the [`synchronous_resource`](/library/api/cuda::mr::synchronous_resource) concept and also satisfies all the provided Properties.
+
+
+```cpp showLineNumbers={false}
+template
+concept synchronous_resource_with = /* see description */;
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/fern/cudapages/cuda/cuda/cuda/permutation_iterator.mdx b/fern/cudapages/cuda/cuda/cuda/permutation_iterator.mdx
new file mode 100644
index 0000000..1a74268
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/permutation_iterator.mdx
@@ -0,0 +1,367 @@
+---
+title: "cuda::permutation_iterator"
+description: "[`permutation_iterator`](/libcudacxx/api/cuda::permutation_iterator) is an iterator which represents a pointer into a reordered view of a given range."
+---
+
+`permutation_iterator` is an iterator which represents a pointer into a reordered view of a given range.
+
+`permutation_iterator` is an imprecise name; the reordered view need not be a strict permutation. This iterator is useful for fusing a scatter or gather operation with other algorithms.
+
+This iterator takes two arguments:
+
+- an iterator to the range `V` on which the "permutation" will be applied, referred to as `iter` below
+- an iterator to a range of indices defining the reindexing scheme that determines how the elements of `V` will be permuted, referred to as `index` below
+
+Note that `permutation_iterator` is not limited to strict permutations of the given range `V`. The distance between begin and end of the reindexing iterators is allowed to be smaller compared to the size of the range `V`, in which case the `permutation_iterator` only provides a "permutation" of a subset of `V`. The indices do not need to be unique. In this same context, it must be noted that the past-the-end `permutation_iterator` is completely defined by means of the past-the-end iterator to the indices.
+
+The following code snippet demonstrates how to create a `permutation_iterator` which represents a reordering of the contents of a `device_vector`.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+## Example
+
+```cpp showLineNumbers={false}
+#include
+#include
+...
+thrust::device_vector<float> values{10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f};
+thrust::device_vector<int> indices{2, 6, 1, 3};
+
+using ElementIterator = thrust::device_vector<float>::iterator;
+using IndexIterator = thrust::device_vector<int>::iterator;
+
+cuda::permutation_iterator<ElementIterator, IndexIterator> iter(values.begin(), indices.begin());
+
+*iter; // returns 30.0f;
+iter[0]; // returns 30.0f;
+iter[1]; // returns 70.0f;
+iter[2]; // returns 20.0f;
+iter[3]; // returns 40.0f;
+
+// iter[4] is an out-of-bounds error
+
+*iter = -1.0f; // sets values[2] to -1.0f;
+iter[0] = -1.0f; // sets values[2] to -1.0f;
+iter[1] = -1.0f; // sets values[6] to -1.0f;
+iter[2] = -1.0f; // sets values[1] to -1.0f;
+iter[3] = -1.0f; // sets values[3] to -1.0f;
+
+// values is now {10, -1, -1, -1, 50, 60, -1, 80}
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### permutation_iterator constexpr
+
+
+
+
+Ensure that the user passes an iterator to something integer-like.
+
+Ensure that the index [value_type](/libcudacxx/api/cuda::permutation_iterator::value_type) is convertible to [difference_type](/libcudacxx/api/cuda::permutation_iterator::difference_type). To actually use `operator+` we need the index iterator to be random access. To actually use `operator+` we need the base iterator to be random access.
+
+Default constructs a `permutation_iterator` with a value-initialized iterator and index.
+
+
+```cpp showLineNumbers={false}
+cuda::permutation_iterator<_Iter, _Index>::permutation_iterator() = default
+```
+
+
+
+
+
+inline noexcept
+
+Constructs a `permutation_iterator` from an iterator and an optional index.
+
+
+```cpp showLineNumbers={false}
+cuda::permutation_iterator<_Iter, _Index>::permutation_iterator(
+ _Iter __iter,
+ _Index __index
+) noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Iter > &&::cuda::std::is_nothrow_copy_constructible_v< _Index >)
+```
+
+
+**Parameters**
+
+
+The iterator to index from
+
+
+
+The iterator with the permutations
+
+
+
+
+
+---
+
+## Methods
+
+### base inline constexpr noexcept nodiscard
+
+
+
+
+Extracts the stored base iterator `iter`.
+
+
+```cpp showLineNumbers={false}
+_Iter cuda::permutation_iterator<_Iter, _Index>::base() && noexcept(::cuda::std::is_nothrow_move_constructible_v< _Iter >)
+```
+
+
+
+
+
+const
+
+Returns a const reference to the stored base iterator `iter`.
+
+
+```cpp showLineNumbers={false}
+const _Iter & cuda::permutation_iterator<_Iter, _Index>::base() const & noexcept
+```
+
+
+
+
+
+### index inline constexpr const noexcept nodiscard
+
+Returns the current index.
+
+
+```cpp showLineNumbers={false}
+difference_type cuda::permutation_iterator<_Iter, _Index>::index() const noexcept
+```
+
+
+**Returns:** Equivalent to `*index`
+
+### operator* inline constexpr noexcept nodiscard
+
+
+
+
+Dereferences the `permutation_iterator`.
+
+
+```cpp showLineNumbers={false}
+decltype(
+ auto
+) noexcept(__iter_[static_cast< __iter_difference_t >(*__index_)])
+```
+
+
+**Returns:** Equivalent to `iter[*index]`
+
+
+
+
+const
+
+Dereferences the `permutation_iterator`.
+
+
+```cpp showLineNumbers={false}
+template
+decltype(
+ auto
+) const noexcept(__iter_[static_cast< __iter_difference_t >(*__index_)])
+```
+
+
+**Returns:** Equivalent to `iter[*index]`
+
+
+
+
+### operator[] inline constexpr noexcept nodiscard
+
+
+
+
+Subscripts the `permutation_iterator` by an offset.
+
+
+```cpp showLineNumbers={false}
+decltype(
+ auto
+) noexcept(__iter_[static_cast< __iter_difference_t >(__index_[__n])])
+```
+
+
+**Returns:** Equivalent to `iter[`[`index`](/libcudacxx/api/cuda::permutation_iterator::index)`[__n]]`
+
+**Parameters**
+
+
+The additional offset
+
+
+
+
+
+const
+
+Subscripts the `permutation_iterator` by an offset.
+
+
+```cpp showLineNumbers={false}
+template
+decltype(
+ auto
+) const noexcept(__iter_[static_cast< __iter_difference_t >(__index_[__n])])
+```
+
+
+**Returns:** Equivalent to `iter[`[`index`](/libcudacxx/api/cuda::permutation_iterator::index)`[__n]]`
+
+**Parameters**
+
+
+The additional offset
+
+
+
+
+
+### operator++ inline constexpr noexcept
+
+
+
+
+Increments the `permutation_iterator`.
+
+
+```cpp showLineNumbers={false}
+permutation_iterator & cuda::permutation_iterator<_Iter, _Index>::operator++() noexcept(++__index_)
+```
+
+
+**Returns:** Equivalent to `++`[`index`](/libcudacxx/api/cuda::permutation_iterator::index)
+
+
+
+
+Increments the `permutation_iterator`.
+
+
+```cpp showLineNumbers={false}
+permutation_iterator cuda::permutation_iterator<_Iter, _Index>::operator++(
+ int
+) noexcept(noexcept(++__index_) &&::cuda::std::is_nothrow_copy_constructible_v< _Iter > &&::cuda::std::is_nothrow_copy_constructible_v< _Index >)
+```
+
+
+**Returns:** Equivalent to `index++`
+
+
+
+
+### operator-- inline constexpr noexcept
+
+
+
+
+Decrements the `permutation_iterator`.
+
+
+```cpp showLineNumbers={false}
+permutation_iterator & cuda::permutation_iterator<_Iter, _Index>::operator--() noexcept(--__index_)
+```
+
+
+**Returns:** Equivalent to `--`[`index`](/libcudacxx/api/cuda::permutation_iterator::index)
+
+
+
+
+Decrements the `permutation_iterator`.
+
+
+```cpp showLineNumbers={false}
+permutation_iterator cuda::permutation_iterator<_Iter, _Index>::operator--(
+ int
+) noexcept(noexcept(--__index_) &&::cuda::std::is_nothrow_copy_constructible_v< _Iter > &&::cuda::std::is_nothrow_copy_constructible_v< _Index >)
+```
+
+
+**Returns:** Equivalent to `index--`
+
+
+
+
+### operator+= inline constexpr noexcept
+
+Advances the `permutation_iterator` by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+permutation_iterator & cuda::permutation_iterator<_Iter, _Index>::operator+=(
+ difference_type __n
+) noexcept(__index_+=__n)
+```
+
+
+**Returns:** Equivalent to [`index`](/libcudacxx/api/cuda::permutation_iterator::index)` + __n`
+
+**Parameters**
+
+
+The number of elements to advance
+
+
+### operator-= inline constexpr noexcept
+
+Decrements the `permutation_iterator` by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+permutation_iterator & cuda::permutation_iterator<_Iter, _Index>::operator-=(
+ difference_type __n
+) noexcept(__index_ -=__n)
+```
+
+
+**Returns:** Equivalent to [`index`](/libcudacxx/api/cuda::permutation_iterator::index)` - __n`
+
+**Parameters**
+
+
+The number of elements to decrement
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `iterator_type` | `_Iter` |
+| `iterator_concept` | `::cuda::std::random_access_iterator_tag` |
+| `iterator_category` | `::cuda::std::random_access_iterator_tag` |
+| `value_type` | `::cuda::std::iter_value_t< _Iter >` |
+| `difference_type` | `::cuda::std::iter_difference_t< _Index >` |
diff --git a/fern/cudapages/cuda/cuda/cuda/pinned_memory_pool.mdx b/fern/cudapages/cuda/cuda/cuda/pinned_memory_pool.mdx
new file mode 100644
index 0000000..8e940c6
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/pinned_memory_pool.mdx
@@ -0,0 +1,167 @@
+---
+title: "cuda::pinned_memory_pool"
+description: ""
+---
+
+`pinned_memory_pool` allocates pinned memory using
+[`cudaMallocFromPoolAsync` / `cudaFreeAsync`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html)
+for allocation/deallocation. When constructed it creates an underlying `cudaMemPool_t` with the location type set to `cudaMemLocationTypeHost` or `cudaMemLocationTypeHostNuma` and owns it.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+**Inherits from:** `cuda::pinned_memory_pool_ref` (public)
+
+---
+
+## Constructors
+
+### pinned_memory_pool inline
+
+
+
+
+Constructs a `pinned_memory_pool` with optional properties.
+
+Properties include the initial pool size and the release threshold. If the pool size grows beyond the release threshold, unused memory held by the pool will be released at the next synchronization event.
+
+
+```cpp showLineNumbers={false}
+cuda::pinned_memory_pool::pinned_memory_pool(
+ memory_pool_properties __properties = {}
+)
+```
+
+
+
+Memory from this pool is accessible from all devices right away, which differs from the default behavior of pinned memory pools where memory is not accessible from devices until `cudaMemPoolSetAccess` is called.
+
+
+**Parameters**
+
+
+Optional, additional properties of the pool to be created.
+
+
+
+
+
+Constructs a `pinned_memory_pool` with the specified NUMA node id and optional properties.
+
+Properties include the initial pool size and the release threshold. If the pool size grows beyond the release threshold, unused memory held by the pool will be released at the next synchronization event.
+
+
+```cpp showLineNumbers={false}
+cuda::pinned_memory_pool::pinned_memory_pool(
+ int __numa_id,
+ memory_pool_properties __properties = {}
+)
+```
+
+
+
+Memory from this pool is accessible from all devices right away, which differs from the default behavior of pinned memory pools where memory is not accessible from devices until `cudaMemPoolSetAccess` is called.
+
+
+**Parameters**
+
+
+The NUMA node id of the NUMA node the pool is constructed on.
+
+
+
+Optional, additional properties of the pool to be created.
+
+
+
+
+
+noexcept
+
+
+```cpp showLineNumbers={false}
+cuda::pinned_memory_pool::pinned_memory_pool(
+ ::cudaMemPool_t __pool
+) noexcept
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+cuda::pinned_memory_pool::pinned_memory_pool(
+ const pinned_memory_pool &
+) = delete
+```
+
+
+
+
+
+### Destructor
+
+### ~pinned_memory_pool inline noexcept
+
+
+```cpp showLineNumbers={false}
+cuda::pinned_memory_pool::~pinned_memory_pool() noexcept
+```
+
+
+---
+
+## Assignment operators
+
+### operator=
+
+
+```cpp showLineNumbers={false}
+pinned_memory_pool & cuda::pinned_memory_pool::operator=(
+ const pinned_memory_pool &
+) = delete
+```
+
+
+---
+
+## Methods
+
+### as_ref inline noexcept
+
+Returns a [`pinned_memory_pool_ref`](/libcudacxx/api/cuda::pinned_memory_pool_ref) for this `pinned_memory_pool`.
+
+We return by reference to ensure that we can subsequently convert to a resource_ref
+
+
+```cpp showLineNumbers={false}
+pinned_memory_pool_ref & cuda::pinned_memory_pool::as_ref() noexcept
+```
+
+
+---
+
+## Static methods
+
+### from_native_handle inline static noexcept
+
+
+```cpp showLineNumbers={false}
+static pinned_memory_pool cuda::pinned_memory_pool::from_native_handle(
+ ::cudaMemPool_t __pool
+) noexcept
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `reference_type` | `pinned_memory_pool_ref` |
+| `default_queries` | `::cuda::mr::properties_list<::cuda::mr::device_accessible, ::cuda::mr::host_accessible >` |
diff --git a/fern/cudapages/cuda/cuda/cuda/pinned_memory_pool_ref.mdx b/fern/cudapages/cuda/cuda/cuda/pinned_memory_pool_ref.mdx
new file mode 100644
index 0000000..9355da4
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/pinned_memory_pool_ref.mdx
@@ -0,0 +1,50 @@
+---
+title: "cuda::pinned_memory_pool_ref"
+description: ""
+---
+
+`pinned_memory_pool_ref` allocates pinned memory using
+[`cudaMallocFromPoolAsync` / `cudaFreeAsync`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html)
+for allocation/deallocation. A `pinned_memory_pool_ref` is a thin wrapper around a `cudaMemPool_t` with the location type set to `cudaMemLocationTypeHost` or `cudaMemLocationTypeHostNuma`.
+
+**Warning:**
+
+`pinned_memory_pool_ref` does not own the pool, and it is the responsibility of the user to ensure that the lifetime of the pool exceeds the lifetime of the `pinned_memory_pool_ref`.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+**Inherits from:** `cuda::__memory_pool_base` (public)
+
+---
+
+## Constructors
+
+### pinned_memory_pool_ref inline explicit noexcept
+
+Constructs the `pinned_memory_pool_ref` from a `cudaMemPool_t`.
+
+
+```cpp showLineNumbers={false}
+cuda::pinned_memory_pool_ref::pinned_memory_pool_ref(
+ ::cudaMemPool_t __pool
+) noexcept
+```
+
+
+**Parameters**
+
+
+The `cudaMemPool_t` used to allocate memory.
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `default_queries` | `::cuda::mr::properties_list<::cuda::mr::device_accessible, ::cuda::mr::host_accessible >` |
diff --git a/fern/cudapages/cuda/cuda/cuda/property_with_value.mdx b/fern/cudapages/cuda/cuda/cuda/property_with_value.mdx
new file mode 100644
index 0000000..9409884
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/property_with_value.mdx
@@ -0,0 +1,24 @@
+---
+title: "cuda::property_with_value"
+description: "The `property_with_value` concept verifies that a Property is stateful and signals this through the `value_type` alias."
+---
+
+C++20 concept
+
+The `property_with_value` concept verifies that a Property is stateful and signals this through the `value_type` alias.
+
+
+```cpp showLineNumbers={false}
+template
+concept property_with_value = /* see description */;
+```
+
+
+
+
+
+
+
+
+
+
diff --git a/fern/cudapages/cuda/cuda/cuda/shuffle_iterator.mdx b/fern/cudapages/cuda/cuda/cuda/shuffle_iterator.mdx
new file mode 100644
index 0000000..8fe36eb
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/shuffle_iterator.mdx
@@ -0,0 +1,259 @@
+---
+title: "cuda::shuffle_iterator"
+description: "[`shuffle_iterator`](/libcudacxx/api/cuda::shuffle_iterator) is an iterator which generates a sequence of integral values representing a random permutation."
+---
+
+`shuffle_iterator` is an iterator which generates a sequence of integral values representing a random permutation.
+
+`shuffle_iterator` is an iterator which generates a sequence of values representing a random permutation. This iterator is useful for working with random permutations of a range without explicitly storing them in memory. The shuffle iterator is also useful for sampling from a range by selecting only a subset of the elements in the permutation.
+
+The following code snippet demonstrates how to create a `shuffle_iterator` which generates a random permutation of the range `[0, 4)`.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+## Example
+
+```cpp showLineNumbers={false}
+#include
+...
+// create a shuffle_iterator
+cuda::shuffle_iterator iterator{cuda::random_bijection{4, cuda::std::minstd_rand(0xDEADBEEF)}};
+// iterator[0] returns 1
+// iterator[1] returns 3
+// iterator[2] returns 2
+// iterator[3] returns 0
+```
+
+
+
+
+
+The type of the index to shuffle. Defaults to uint64_t
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### shuffle_iterator constexpr noexcept
+
+
+
+
+
+```cpp showLineNumbers={false}
+cuda::shuffle_iterator<_IndexType, _Bijection>::shuffle_iterator() noexcept = default
+```
+
+
+
+
+
+inline
+
+Constructs a `shuffle_iterator` from a given bijection and an optional start position.
+
+
+```cpp showLineNumbers={false}
+cuda::shuffle_iterator<_IndexType, _Bijection>::shuffle_iterator(
+ _Bijection __bijection,
+ value_type __start = 0
+) noexcept(::cuda::std::is_nothrow_move_constructible_v< _Bijection >)
+```
+
+
+**Parameters**
+
+
+The bijection representing the shuffled integer sequence
+
+
+
+The position of the iterator in the shuffled integer sequence
+
+
+
+
+
+inline explicit
+
+Constructs a `shuffle_iterator` by constructing the bijection function in place and an optional start position.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::shuffle_iterator<_IndexType, _Bijection>::shuffle_iterator(
+ value_type __num_elements,
+ _RNG &&__gen,
+ value_type __start = 0
+) noexcept(::cuda::std::is_nothrow_constructible_v< _Bijection, value_type, _RNG >)
+```
+
+
+**Parameters**
+
+
+The size of the bijection sequence
+
+
+
+The random number generator to initialize the bijection
+
+
+
+The optional starting index of the `shuffle_iterator` in the bijection sequence
+
+
+
+
+
+---
+
+## Methods
+
+### operator* inline constexpr const noexcept nodiscard
+
+Dereferences the `shuffle_iterator` by invoking the bijection with the stored index.
+
+
+```cpp showLineNumbers={false}
+value_type cuda::shuffle_iterator<_IndexType, _Bijection>::operator*() const noexcept(__bijection_(0))
+```
+
+
+### operator[] inline constexpr const noexcept nodiscard
+
+Subscripts the `shuffle_iterator` by invoking the bijection with the stored index advanced by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+value_type cuda::shuffle_iterator<_IndexType, _Bijection>::operator[](
+ difference_type __n
+) const noexcept(__bijection_(0))
+```
+
+
+**Parameters**
+
+
+The additional number of elements
+
+
+### operator++ inline constexpr noexcept
+
+
+
+
+Increments the `shuffle_iterator`.
+
+
+```cpp showLineNumbers={false}
+shuffle_iterator & cuda::shuffle_iterator<_IndexType, _Bijection>::operator++() noexcept
+```
+
+
+
+
+
+Increments the `shuffle_iterator`.
+
+
+```cpp showLineNumbers={false}
+shuffle_iterator cuda::shuffle_iterator<_IndexType, _Bijection>::operator++(
+ int
+) noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Bijection >)
+```
+
+
+
+
+
+### operator-- inline constexpr noexcept
+
+
+
+
+Decrements the `shuffle_iterator`.
+
+
+```cpp showLineNumbers={false}
+shuffle_iterator & cuda::shuffle_iterator<_IndexType, _Bijection>::operator--() noexcept
+```
+
+
+
+
+
+nodiscard
+
+Decrements the `shuffle_iterator`.
+
+
+```cpp showLineNumbers={false}
+shuffle_iterator cuda::shuffle_iterator<_IndexType, _Bijection>::operator--(
+ int
+) noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Bijection >)
+```
+
+
+
+
+
+### operator+= inline constexpr noexcept
+
+Advances the `shuffle_iterator` by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+shuffle_iterator & cuda::shuffle_iterator<_IndexType, _Bijection>::operator+=(
+ difference_type __n
+) noexcept
+```
+
+
+**Parameters**
+
+
+The number of elements to advance
+
+
+### operator-= inline constexpr noexcept
+
+Decrements the `shuffle_iterator` by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+shuffle_iterator & cuda::shuffle_iterator<_IndexType, _Bijection>::operator-=(
+ difference_type __n
+) noexcept
+```
+
+
+**Parameters**
+
+
+The number of elements to decrement
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `iterator_category` | `::cuda::std::random_access_iterator_tag` |
+| `iterator_concept` | `::cuda::std::random_access_iterator_tag` |
+| `value_type` | `_IndexType` |
+| `difference_type` | `::cuda::std::make_signed_t< value_type >` |
+| `reference` | `_IndexType` |
+| `pointer` | `void` |
diff --git a/fern/cudapages/cuda/cuda/cuda/std/pointer_traits.mdx b/fern/cudapages/cuda/cuda/cuda/std/pointer_traits.mdx
new file mode 100644
index 0000000..0ad238a
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/std/pointer_traits.mdx
@@ -0,0 +1,56 @@
+---
+title: "cuda::std::pointer_traits<::cuda::heterogeneous_iterator< _Tp, _Properties... > >"
+description: ""
+---
+
+```cpp showLineNumbers={false}
+#include
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Static methods
+
+### to_address inline static constexpr noexcept nodiscard
+
+Retrieve the address of the element pointed at by a [heterogeneous_iterator](/libcudacxx/api/cuda::heterogeneous_iterator).
+
+ >": "/libcudacxx/api/cuda::std::pointer_traits%3C::cuda::heterogeneous_iterator%3C _Tp, _Properties... %3E %3E"}}>
+```cpp showLineNumbers={false}
+static constexpr element_type * cuda::std::pointer_traits<::cuda::heterogeneous_iterator<_Tp, _Properties...>>::to_address(
+ const pointer __iter
+) noexcept
+```
+
+
+**Returns:** A pointer to the element pointed to by the [heterogeneous_iterator](/libcudacxx/api/cuda::heterogeneous_iterator)
+
+**Parameters**
+
+
+A [heterogeneous_iterator](/libcudacxx/api/cuda::heterogeneous_iterator).
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `pointer` | `::cuda::heterogeneous_iterator< _Tp, _Properties... >` |
+| `element_type` | `_Tp` |
+| `difference_type` | `::cuda::std::ptrdiff_t` |
diff --git a/fern/cudapages/cuda/cuda/cuda/stream.mdx b/fern/cudapages/cuda/cuda/cuda/stream.mdx
new file mode 100644
index 0000000..1865562
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/stream.mdx
@@ -0,0 +1,445 @@
+---
+title: "cuda::stream"
+description: "An owning wrapper for cudaStream_t."
+---
+
+An owning wrapper for cudaStream_t.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+**Inherits from:** `cuda::stream_ref` (public)
+
+---
+
+## Constructors
+
+### stream inline
+
+
+
+
+explicit
+
+Constructs a stream on a specified device and with specified priority.
+
+Priority is defaulted to [stream::default_priority](/libcudacxx/api/cuda::stream::default_priority)
+
+
+```cpp showLineNumbers={false}
+cuda::stream::stream(
+ device_ref __dev,
+ int __priority = default_priority
+)
+```
+
+
+**Throws:** `cuda_error` if stream creation fails
+
+
+
+
+explicit noexcept
+
+Construct a new `stream` object into the moved-from state.
+
+
+```cpp showLineNumbers={false}
+cuda::stream::stream(
+ no_init_t
+) noexcept
+```
+
+
+
+[`stream()`](/libcudacxx/api/cuda::stream::stream()) returns an invalid stream handle
+
+
+
+
+
+noexcept
+
+Move-construct a new `stream` object.
+
+
+```cpp showLineNumbers={false}
+cuda::stream::stream(
+ stream &&__other
+) noexcept
+```
+
+
+
+`__other` is in moved-from state.
+
+
+
+
+
+explicit
+
+
+```cpp showLineNumbers={false}
+cuda::stream::stream(
+ ::cudaStream_t __handle
+)
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+cuda::stream::stream(
+ const stream &
+) = delete
+```
+
+
+
+
+
+### Destructor
+
+### ~stream inline
+
+Destroy the `stream` object.
+
+
+```cpp showLineNumbers={false}
+cuda::stream::~stream()
+```
+
+
+
+If the stream fails to be destroyed, the error is silently ignored.
+
+
+---
+
+## Assignment operators
+
+### operator= inline noexcept
+
+
+
+
+Move-assign a `stream` object.
+
+
+```cpp showLineNumbers={false}
+stream & cuda::stream::operator=(
+ stream &&__other
+) noexcept
+```
+
+
+
+`__other` is in a moved-from state.
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+stream & cuda::stream::operator=(
+ const stream &
+) = delete
+```
+
+
+
+
+
+---
+
+## Methods
+
+### release inline nodiscard
+
+Retrieve the native `cudaStream_t` handle and give up ownership.
+
+
+```cpp showLineNumbers={false}
+::cudaStream_t cuda::stream::release()
+```
+
+
+
+The stream object is in a moved-from state.
+
+
+**Returns:** The native `cudaStream_t` handle being held by the `stream` object.
+
+### get inline constexpr const noexcept nodiscard
+
+Returns the wrapped `cudaStream_t` handle.
+
+
+```cpp showLineNumbers={false}
+value_type cuda::stream::get() const noexcept
+```
+
+
+### sync inline const
+
+Synchronizes the wrapped stream.
+
+
+```cpp showLineNumbers={false}
+void cuda::stream::sync() const
+```
+
+
+**Throws:** `cuda::cuda_error` if synchronization fails.
+
+### wait inline const
+
+
+
+
+Deprecated.
+
+Use [sync()](/libcudacxx/api/cuda::stream_ref::sync()) instead.
+
+
+```cpp showLineNumbers={false}
+void cuda::stream::wait() const
+```
+
+
+
+Use [sync()](/libcudacxx/api/cuda::stream_ref::sync()) instead.
+
+
+
+
+
+Make all future work submitted into this stream depend on completion of the specified event.
+
+
+```cpp showLineNumbers={false}
+void cuda::stream::wait(
+ event_ref __ev
+) const
+```
+
+
+**Throws:** `cuda_error` if inserting the dependency fails
+
+**Parameters**
+
+
+Event that this stream should wait for
+
+
+
+
+
+Make all future work submitted into this stream depend on completion of all work from the specified stream.
+
+
+```cpp showLineNumbers={false}
+void cuda::stream::wait(
+ stream_ref __other
+) const
+```
+
+
+**Throws:** `cuda_error` if inserting the dependency fails
+
+**Parameters**
+
+
+Stream that this stream should wait for
+
+
+
+
+
+### is_done inline const nodiscard
+
+Queries if all operations on the stream have completed.
+
+
+```cpp showLineNumbers={false}
+bool cuda::stream::is_done() const
+```
+
+
+**Returns:** `true` if all operations have completed, or `false` if not.
+
+**Throws:** `cuda::cuda_error` if the query fails.
+
+### ready inline const nodiscard
+
+Queries if all operations on the wrapped stream have completed.
+
+
+```cpp showLineNumbers={false}
+bool cuda::stream::ready() const
+```
+
+
+**Returns:** `true` if all operations have completed, or `false` if not.
+
+**Throws:** `cuda::cuda_error` if the query fails.
+
+### priority inline const nodiscard
+
+Queries the priority of the wrapped stream.
+
+
+```cpp showLineNumbers={false}
+int cuda::stream::priority() const
+```
+
+
+**Returns:** value representing the priority of the wrapped stream.
+
+**Throws:** `cuda::cuda_error` if the query fails.
+
+### id inline const nodiscard
+
+Get the unique ID of the stream.
+
+Stream handles are sometimes reused, but ID is guaranteed to be unique.
+
+
+```cpp showLineNumbers={false}
+stream_id cuda::stream::id() const
+```
+
+
+**Returns:** The unique ID of the stream
+
+**Throws:** `cuda_error` if the ID query fails
+
+### record_event inline const nodiscard
+
+Create a new event and record it into this stream.
+
+
+```cpp showLineNumbers={false}
+event cuda::stream::record_event(
+ event_flags __flags = event_flags::none
+) const
+```
+
+
+**Returns:** A new event that was recorded into this stream
+
+**Throws:** `cuda_error` if event creation or record failed
+
+### record_timed_event inline const nodiscard
+
+Create a new timed event and record it into this stream.
+
+
+```cpp showLineNumbers={false}
+timed_event cuda::stream::record_timed_event(
+ event_flags __flags = event_flags::none
+) const
+```
+
+
+**Returns:** A new timed event that was recorded into this stream
+
+**Throws:** `cuda_error` if event creation or record failed
+
+### device inline const nodiscard
+
+Get device under which this stream was created.
+
+Note: In case of a stream created under a `green_context` the device on which that `green_context` was created is returned
+
+
+```cpp showLineNumbers={false}
+device_ref cuda::stream::device() const
+```
+
+
+**Throws:** `cuda_error` if device check fails
+
+### query inline constexpr const noexcept nodiscard
+
+Queries the [`stream_ref`](/libcudacxx/api/cuda::stream_ref) for itself.
+
+This makes [`stream_ref`](/libcudacxx/api/cuda::stream_ref) usable in places where we expect an environment with a [`get_stream_t`](/libcudacxx/api/cuda::get_stream_t) query
+
+
+```cpp showLineNumbers={false}
+stream_ref cuda::stream::query(
+ const ::cuda::get_stream_t &
+) const noexcept
+```
+
+
+---
+
+## Static methods
+
+### from_native_handle inline static nodiscard
+
+
+
+
+Construct a `stream` object from a native `cudaStream_t` handle.
+
+
+```cpp showLineNumbers={false}
+static stream cuda::stream::from_native_handle(
+ ::cudaStream_t __handle
+)
+```
+
+
+
+The constructed `stream` object takes ownership of the native handle.
+
+
+**Returns:** The constructed `stream` object
+
+**Parameters**
+
+
+The native handle
+
+
+
+
+
+The following overloads are deleted to prevent misuse:
+
+
+```cpp showLineNumbers={false}
+static stream cuda::stream::from_native_handle(int) = delete;
+static stream cuda::stream::from_native_handle(::cuda::std::nullptr_t) = delete;
+static stream cuda::stream::from_native_handle(invalid_stream_t) = delete;
+```
+
+
+
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `value_type` | `::cudaStream_t` |
+
+---
+
+## Member variables
+
+| Name | Type | Description |
+|---|---|---|
+| `default_priority` static constexpr | `int` | |
diff --git a/fern/cudapages/cuda/cuda/cuda/stream_ref.mdx b/fern/cudapages/cuda/cuda/cuda/stream_ref.mdx
new file mode 100644
index 0000000..3477f1b
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/stream_ref.mdx
@@ -0,0 +1,306 @@
+---
+title: "cuda::stream_ref"
+description: "A non-owning wrapper for a `cudaStream_t`."
+---
+
+A non-owning wrapper for a `cudaStream_t`.
+
+```cpp showLineNumbers={false}
+#include <cuda/stream_ref>
+```
+
+---
+
+## Constructors
+
+### stream_ref
+
+
+
+
+Constructs a `stream_ref` of the "default" CUDA stream.
+
+For behavior of the default stream, see the CUDA Runtime API documentation on stream synchronization behavior.
+
+
+```cpp showLineNumbers={false}
+cuda::stream_ref::stream_ref() = default
+```
+
+
+**See also:**
+[https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html](https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html)
+
+
+
+
+
+inline constexpr noexcept
+
+Constructs a `stream_ref` from a `cudaStream_t` handle.
+
+This constructor provides implicit conversion from `cudaStream_t`.
+
+
+```cpp showLineNumbers={false}
+cuda::stream_ref::stream_ref(
+ value_type __stream_
+) noexcept
+```
+
+
+
+Note: It is the caller's responsibility to ensure the `stream_ref` does not outlive the stream identified by the `cudaStream_t` handle.
+
+
+
+
+
+inline explicit noexcept
+
+Constructs a `stream_ref` from the [`cuda::invalid_stream_t`](/libcudacxx/api/cuda::invalid_stream_t).
+
+
+```cpp showLineNumbers={false}
+cuda::stream_ref::stream_ref(
+ invalid_stream_t
+) noexcept
+```
+
+
+
+Any CUDA APIs called on the created object will result in a CUDA error.
+
+
+
+
+
+The following overloads are deleted to prevent misuse:
+
+
+```cpp showLineNumbers={false}
+cuda::stream_ref::stream_ref(int) = delete;
+cuda::stream_ref::stream_ref(::cuda::std::nullptr_t) = delete;
+```
+
+
+
+
+
+---
+
+## Methods
+
+### get inline constexpr const noexcept nodiscard
+
+Returns the wrapped `cudaStream_t` handle.
+
+
+```cpp showLineNumbers={false}
+value_type cuda::stream_ref::get() const noexcept
+```
+
+
+### sync inline const
+
+Synchronizes the wrapped stream.
+
+
+```cpp showLineNumbers={false}
+void cuda::stream_ref::sync() const
+```
+
+
+**Throws:** `cuda::cuda_error` if synchronization fails.
+
+### wait inline const
+
+
+
+
+Deprecated.
+
+Use [sync()](/libcudacxx/api/cuda::stream_ref::sync()) instead.
+
+
+```cpp showLineNumbers={false}
+void cuda::stream_ref::wait() const
+```
+
+
+
+Use [sync()](/libcudacxx/api/cuda::stream_ref::sync()) instead.
+
+
+
+
+
+Make all future work submitted into this stream depend on completion of the specified event.
+
+
+```cpp showLineNumbers={false}
+void cuda::stream_ref::wait(
+ event_ref __ev
+) const
+```
+
+
+**Throws:** `cuda_error` if inserting the dependency fails
+
+**Parameters**
+
+
+Event that this stream should wait for
+
+
+
+
+
+Make all future work submitted into this stream depend on completion of all work from the specified stream.
+
+
+```cpp showLineNumbers={false}
+void cuda::stream_ref::wait(
+ stream_ref __other
+) const
+```
+
+
+**Throws:** `cuda_error` if inserting the dependency fails
+
+**Parameters**
+
+
+Stream that this stream should wait for
+
+
+
+
+
+### is_done inline const nodiscard
+
+Queries if all operations on the stream have completed.
+
+
+```cpp showLineNumbers={false}
+bool cuda::stream_ref::is_done() const
+```
+
+
+**Returns:** `true` if all operations have completed, or `false` if not.
+
+**Throws:** `cuda::cuda_error` if the query fails.
+
+### ready inline const nodiscard
+
+Queries if all operations on the wrapped stream have completed.
+
+
+```cpp showLineNumbers={false}
+bool cuda::stream_ref::ready() const
+```
+
+
+**Returns:** `true` if all operations have completed, or `false` if not.
+
+**Throws:** `cuda::cuda_error` if the query fails.
+
+### priority inline const nodiscard
+
+Queries the priority of the wrapped stream.
+
+
+```cpp showLineNumbers={false}
+int cuda::stream_ref::priority() const
+```
+
+
+**Returns:** value representing the priority of the wrapped stream.
+
+**Throws:** `cuda::cuda_error` if the query fails.
+
+### id inline const nodiscard
+
+Get the unique ID of the stream.
+
+Stream handles are sometimes reused, but ID is guaranteed to be unique.
+
+
+```cpp showLineNumbers={false}
+stream_id cuda::stream_ref::id() const
+```
+
+
+**Returns:** The unique ID of the stream
+
+**Throws:** `cuda_error` if the ID query fails
+
+### record_event inline const nodiscard
+
+Create a new event and record it into this stream.
+
+
+```cpp showLineNumbers={false}
+event cuda::stream_ref::record_event(
+ event_flags __flags = event_flags::none
+) const
+```
+
+
+**Returns:** A new event that was recorded into this stream
+
+**Throws:** `cuda_error` if event creation or record failed
+
+### record_timed_event inline const nodiscard
+
+Create a new timed event and record it into this stream.
+
+
+```cpp showLineNumbers={false}
+timed_event cuda::stream_ref::record_timed_event(
+ event_flags __flags = event_flags::none
+) const
+```
+
+
+**Returns:** A new timed event that was recorded into this stream
+
+**Throws:** `cuda_error` if event creation or record failed
+
+### device inline const nodiscard
+
+Get device under which this stream was created.
+
+Note: In case of a stream created under a `green_context` the device on which that `green_context` was created is returned
+
+
+```cpp showLineNumbers={false}
+device_ref cuda::stream_ref::device() const
+```
+
+
+**Throws:** `cuda_error` if device check fails
+
+### query inline constexpr const noexcept nodiscard
+
+Queries the `stream_ref` for itself.
+
+This makes `stream_ref` usable in places where we expect an environment with a [`get_stream_t`](/libcudacxx/api/cuda::get_stream_t) query
+
+
+```cpp showLineNumbers={false}
+stream_ref cuda::stream_ref::query(
+ const ::cuda::get_stream_t &
+) const noexcept
+```
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `value_type` | `::cudaStream_t` |
diff --git a/fern/cudapages/cuda/cuda/cuda/strided_iterator.mdx b/fern/cudapages/cuda/cuda/cuda/strided_iterator.mdx
new file mode 100644
index 0000000..e296c7e
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/strided_iterator.mdx
@@ -0,0 +1,341 @@
+---
+title: "cuda::strided_iterator"
+description: "A [`strided_iterator`](/libcudacxx/api/cuda::strided_iterator) wraps another iterator and advances it by a specified stride each time it is incremented or decremented."
+---
+
+A `strided_iterator` wraps another iterator and advances it by a specified stride each time it is incremented or decremented.
+
+```cpp showLineNumbers={false}
+#include <cuda/iterator>
+```
+
+
+
+
+
+A random access iterator
+
+
+
+Either an [integer-like](https://eel.is/c++draft/iterator.concept.winc#4) or an [integral-constant-like](https://eel.is/c++draft/views.contiguous#concept:integral-constant-like) specifying the stride
+
+
+
+
+
+---
+
+## Constructors
+
+### strided_iterator inline constexpr noexcept
+
+
+
+
+value-initializes both the base iterator and stride
+
+
+```cpp showLineNumbers={false}
+template
+cuda::strided_iterator<_Iter, _Stride>::strided_iterator() noexcept(::cuda::std::is_nothrow_default_constructible_v< _Iter2 > &&::cuda::std::is_nothrow_default_constructible_v< _Stride2 >)
+```
+
+
+
+`_Iter` must be default initializable because it is a random_access_iterator and thereby semiregular. `_Stride` must be integer-like or integral-constant-like, which requires default constructibility.
+
+
+
+
+
+explicit
+
+Constructs a `strided_iterator` from a base iterator.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::strided_iterator<_Iter, _Stride>::strided_iterator(
+ _Iter __iter
+) noexcept(::cuda::std::is_nothrow_move_constructible_v< _Iter > &&::cuda::std::is_nothrow_default_constructible_v< _Stride2 >)
+```
+
+
+
+We cannot construct a `strided_iterator` with an [integer-like](https://eel.is/c++draft/iterator.concept.winc#4) stride, because that would value construct to 0 and incrementing the iterator would do nothing.
+
+
+**Parameters**
+
+
+The base iterator
+
+
+
+
+
+explicit
+
+Constructs a `strided_iterator` from a base iterator and a stride.
+
+
+```cpp showLineNumbers={false}
+cuda::strided_iterator<_Iter, _Stride>::strided_iterator(
+ _Iter __iter,
+ _Stride __stride
+) noexcept(::cuda::std::is_nothrow_move_constructible_v< _Iter > &&::cuda::std::is_nothrow_move_constructible_v< _Stride >)
+```
+
+
+**Parameters**
+
+
+The base iterator
+
+
+
+The new stride
+
+
+
+
+
+---
+
+## Methods
+
+### base inline constexpr noexcept nodiscard
+
+
+
+
+Extracts the stored iterator.
+
+
+```cpp showLineNumbers={false}
+_Iter cuda::strided_iterator<_Iter, _Stride>::base() && noexcept(::cuda::std::is_nothrow_move_constructible_v< _Iter >)
+```
+
+
+
+
+
+const
+
+Returns a const reference to the stored iterator.
+
+
+```cpp showLineNumbers={false}
+const _Iter & cuda::strided_iterator<_Iter, _Stride>::base() const & noexcept
+```
+
+
+
+
+
+### stride inline constexpr const noexcept nodiscard
+
+Returns the current stride as an integral value.
+
+
+```cpp showLineNumbers={false}
+difference_type cuda::strided_iterator<_Iter, _Stride>::stride() const noexcept(__noexcept_stride)
+```
+
+
+### operator* inline constexpr noexcept nodiscard
+
+
+
+
+Dereferences the stored base iterator.
+
+
+```cpp showLineNumbers={false}
+decltype(
+ auto
+) noexcept(*::cuda::std::declval< _Iter & >())
+```
+
+
+
+
+
+const
+
+Dereferences the stored base iterator.
+
+
+```cpp showLineNumbers={false}
+template
+decltype(
+ auto
+) const noexcept(*::cuda::std::declval< const _Iter2 & >())
+```
+
+
+
+
+
+### operator[] inline constexpr noexcept nodiscard
+
+
+
+
+Subscripts the stored base iterator with a given offset times the stride.
+
+
+```cpp showLineNumbers={false}
+decltype(
+ auto
+) noexcept(__noexcept_stride &&noexcept(::cuda::std::declval< _Iter & >()[__n]))
+```
+
+
+**Parameters**
+
+
+The offset
+
+
+
+
+
+const
+
+Subscripts the stored base iterator with a given offset times the stride.
+
+
+```cpp showLineNumbers={false}
+template
+decltype(
+ auto
+) const noexcept(__noexcept_stride &&noexcept(::cuda::std::declval< const _Iter2 & >()[__n]))
+```
+
+
+**Parameters**
+
+
+The offset
+
+
+
+
+
+### operator++ inline constexpr noexcept
+
+
+
+
+Increments the stored base iterator by the stride.
+
+
+```cpp showLineNumbers={false}
+strided_iterator & cuda::strided_iterator<_Iter, _Stride>::operator++() noexcept(__noexcept_stride &&noexcept(::cuda::std::declval< _Iter & >()+=1))
+```
+
+
+
+
+
+Increments the stored base iterator by the stride.
+
+
+```cpp showLineNumbers={false}
+auto cuda::strided_iterator<_Iter, _Stride>::operator++(
+ int
+) noexcept(noexcept(__noexcept_stride &&noexcept(::cuda::std::declval< _Iter & >()+=1)) &&::cuda::std::is_nothrow_copy_constructible_v< _Iter > &&::cuda::std::is_nothrow_copy_constructible_v< _Stride >)
+```
+
+
+
+
+
+### operator-- inline constexpr noexcept
+
+
+
+
+Decrements the stored base iterator by the stride.
+
+
+```cpp showLineNumbers={false}
+strided_iterator & cuda::strided_iterator<_Iter, _Stride>::operator--() noexcept(__noexcept_stride &&noexcept(::cuda::std::declval< _Iter & >() -=1))
+```
+
+
+
+
+
+Decrements the stored base iterator by the stride.
+
+
+```cpp showLineNumbers={false}
+strided_iterator cuda::strided_iterator<_Iter, _Stride>::operator--(
+ int
+) noexcept(noexcept(__noexcept_stride &&noexcept(::cuda::std::declval< _Iter & >() -=1)) &&::cuda::std::is_nothrow_copy_constructible_v< _Iter > &&::cuda::std::is_nothrow_copy_constructible_v< _Stride >)
+```
+
+
+
+
+
+### operator+= inline constexpr noexcept
+
+Advances a `strided_iterator` by a given number of steps.
+
+
+```cpp showLineNumbers={false}
+strided_iterator & cuda::strided_iterator<_Iter, _Stride>::operator+=(
+ difference_type __n
+) noexcept(__noexcept_stride &&noexcept(::cuda::std::declval< _Iter & >()+=1))
+```
+
+
+
+Increments the base iterator by `__n` times the stride
+
+
+**Parameters**
+
+
+The number of steps to increment
+
+
+### operator-= inline constexpr noexcept
+
+Decrements a `strided_iterator` by a given number of steps.
+
+
+```cpp showLineNumbers={false}
+strided_iterator & cuda::strided_iterator<_Iter, _Stride>::operator-=(
+ difference_type __n
+) noexcept(__noexcept_stride &&noexcept(::cuda::std::declval< _Iter & >() -=1))
+```
+
+
+
+Decrements the base iterator by `__n` times the stride
+
+
+**Parameters**
+
+
+The number of steps to decrement
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `iterator_concept` | `::cuda::std::random_access_iterator_tag` |
+| `iterator_category` | `::cuda::std::random_access_iterator_tag` |
+| `value_type` | `::cuda::std::iter_value_t< _Iter >` |
+| `difference_type` | `::cuda::std::iter_difference_t< _Iter >` |
+| `reference` | `::cuda::std::iter_reference_t< _Iter >` |
+| `pointer` | `void` |
diff --git a/fern/cudapages/cuda/cuda/cuda/tabulate_output_iterator.mdx b/fern/cudapages/cuda/cuda/cuda/tabulate_output_iterator.mdx
new file mode 100644
index 0000000..a422fac
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/tabulate_output_iterator.mdx
@@ -0,0 +1,295 @@
+---
+title: "cuda::tabulate_output_iterator"
+description: "[`tabulate_output_iterator`](/libcudacxx/api/cuda::tabulate_output_iterator) is a special kind of output iterator which, whenever a value is assigned to a dereferenced iterator, calls the given callable with the index that corresponds to the offset of the dereferenced iterator and the assigned value."
+---
+
+`tabulate_output_iterator` is a special kind of output iterator which, whenever a value is assigned to a dereferenced iterator, calls the given callable with the index that corresponds to the offset of the dereferenced iterator and the assigned value.
+
+The following code snippet demonstrates how to create a `tabulate_output_iterator` which prints the index and the assigned value.
+
+```cpp showLineNumbers={false}
+#include <cuda/iterator>
+```
+
+## Example
+
+```cpp showLineNumbers={false}
+#include <cuda/iterator>
+
+struct print_op
+{
+ __host__ __device__ void operator()(int index, float value) const
+ {
+ printf("%d: %f\n", index, value);
+ }
+};
+
+int main()
+{
+ auto tabulate_it = cuda::make_tabulate_output_iterator(print_op{});
+
+ tabulate_it[0] = 1.0f; // prints: 0: 1.0
+ tabulate_it[1] = 3.0f; // prints: 1: 3.0
+ tabulate_it[9] = 5.0f; // prints: 9: 5.0
+}
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### tabulate_output_iterator inline constexpr noexcept
+
+
+
+
+
+```cpp showLineNumbers={false}
+template
+cuda::tabulate_output_iterator<_Fn, _Index>::tabulate_output_iterator() noexcept(::cuda::std::is_nothrow_default_constructible_v< _Fn2 >)
+```
+
+
+
+
+
+Constructs a `tabulate_output_iterator` with a given functor and an optional index.
+
+
+```cpp showLineNumbers={false}
+cuda::tabulate_output_iterator<_Fn, _Index>::tabulate_output_iterator(
+ _Fn __func,
+ _Index __index = 0
+) noexcept(::cuda::std::is_nothrow_move_constructible_v< _Fn >)
+```
+
+
+**Parameters**
+
+
+The output function
+
+
+
+The position in the output sequence
+
+
+
+
+
+---
+
+## Methods
+
+### index inline constexpr const noexcept nodiscard
+
+Returns the stored index.
+
+
+```cpp showLineNumbers={false}
+difference_type cuda::tabulate_output_iterator<_Fn, _Index>::index() const noexcept
+```
+
+
+### operator* inline constexpr noexcept nodiscard
+
+
+
+
+Dereferences the `tabulate_output_iterator`.
+
+
+```cpp showLineNumbers={false}
+auto cuda::tabulate_output_iterator<_Fn, _Index>::operator*() noexcept
+```
+
+
+**Returns:** A proxy that applies the stored function and index on assignment
+
+
+
+
+const
+
+Dereferences the `tabulate_output_iterator`.
+
+
+```cpp showLineNumbers={false}
+auto cuda::tabulate_output_iterator<_Fn, _Index>::operator*() const noexcept
+```
+
+
+**Returns:** A proxy that applies the stored function and index on assignment
+
+
+
+
+### operator[] inline constexpr noexcept nodiscard
+
+
+
+
+Subscripts the `tabulate_output_iterator` with a given offset.
+
+
+```cpp showLineNumbers={false}
+auto cuda::tabulate_output_iterator<_Fn, _Index>::operator[](
+ difference_type __n
+) noexcept
+```
+
+
+**Returns:** A proxy that applies the stored function and index on assignment
+
+**Parameters**
+
+
+The additional offset to advance the stored index
+
+
+
+
+
+const
+
+Subscripts the `tabulate_output_iterator` with a given offset.
+
+
+```cpp showLineNumbers={false}
+auto cuda::tabulate_output_iterator<_Fn, _Index>::operator[](
+ difference_type __n
+) const noexcept
+```
+
+
+**Returns:** A proxy that applies the stored function and index on assignment
+
+**Parameters**
+
+
+The additional offset to advance the stored index
+
+
+
+
+
+### operator++ inline constexpr noexcept
+
+
+
+
+Increments the `tabulate_output_iterator` by incrementing the stored index.
+
+
+```cpp showLineNumbers={false}
+tabulate_output_iterator & cuda::tabulate_output_iterator<_Fn, _Index>::operator++() noexcept
+```
+
+
+
+
+
+Increments the `tabulate_output_iterator` by incrementing the stored index.
+
+
+```cpp showLineNumbers={false}
+tabulate_output_iterator cuda::tabulate_output_iterator<_Fn, _Index>::operator++(
+ int
+) noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Fn >)
+```
+
+
+
+
+
+### operator-- inline constexpr noexcept
+
+
+
+
+Decrements the `tabulate_output_iterator` by decrementing the stored index.
+
+
+```cpp showLineNumbers={false}
+tabulate_output_iterator & cuda::tabulate_output_iterator<_Fn, _Index>::operator--() noexcept
+```
+
+
+
+
+
+Decrements the `tabulate_output_iterator` by decrementing the stored index.
+
+
+```cpp showLineNumbers={false}
+tabulate_output_iterator cuda::tabulate_output_iterator<_Fn, _Index>::operator--(
+ int
+) noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Fn >)
+```
+
+
+
+
+
+### operator+= inline constexpr noexcept
+
+Advances the `tabulate_output_iterator` by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+tabulate_output_iterator & cuda::tabulate_output_iterator<_Fn, _Index>::operator+=(
+ difference_type __n
+) noexcept
+```
+
+
+**Parameters**
+
+
+The number of elements to advance
+
+
+### operator-= inline constexpr noexcept
+
+Decrements the `tabulate_output_iterator` by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+tabulate_output_iterator & cuda::tabulate_output_iterator<_Fn, _Index>::operator-=(
+ difference_type __n
+) noexcept
+```
+
+
+**Parameters**
+
+
+The number of elements to decrement
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `iterator_concept` | `::cuda::std::random_access_iterator_tag` |
+| `iterator_category` | `::cuda::std::random_access_iterator_tag` |
+| `difference_type` | `_Index` |
+| `value_type` | `void` |
+| `pointer` | `void` |
+| `reference` | `void` |
diff --git a/fern/cudapages/cuda/cuda/cuda/timed_event.mdx b/fern/cudapages/cuda/cuda/cuda/timed_event.mdx
new file mode 100644
index 0000000..0ad5cf0
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/timed_event.mdx
@@ -0,0 +1,286 @@
+---
+title: "cuda::timed_event"
+description: "An owning wrapper for a `cudaEvent_t` with timing enabled."
+---
+
+An owning wrapper for a `cudaEvent_t` with timing enabled.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+**Inherits from:** `cuda::event` (public)
+
+---
+
+## Constructors
+
+### timed_event
+
+
+
+
+inline explicit
+
+Construct a new `timed_event` object with the specified flags and record the event on the specified stream.
+
+
+```cpp showLineNumbers={false}
+cuda::timed_event::timed_event(
+ stream_ref __stream,
+ event_flags __flags = event_flags::none
+)
+```
+
+
+**Throws:** `cuda_error` if the event creation fails.
+
+
+
+
+inline explicit
+
+Construct a new `timed_event` object with the specified flags.
+
+The event can only be recorded on streams from the specified device.
+
+
+```cpp showLineNumbers={false}
+cuda::timed_event::timed_event(
+ device_ref __device,
+ event_flags __flags = event_flags::none
+)
+```
+
+
+**Throws:** `cuda_error` if the event creation fails.
+
+
+
+
+inline constexpr explicit noexcept
+
+Construct a new `timed_event` object into the moved-from state.
+
+
+```cpp showLineNumbers={false}
+cuda::timed_event::timed_event(
+ no_init_t
+) noexcept
+```
+
+
+
+[`get()`](/libcudacxx/api/cuda::event_ref::get()) returns `cudaEvent_t()`.
+
+
+
+
+
+noexcept
+
+
+```cpp showLineNumbers={false}
+cuda::timed_event::timed_event(
+ timed_event &&
+) noexcept = default
+```
+
+
+
+
+
+inline constexpr explicit noexcept
+
+
+```cpp showLineNumbers={false}
+cuda::timed_event::timed_event(
+ ::cudaEvent_t __evnt
+) noexcept
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+cuda::timed_event::timed_event(
+ const timed_event &
+) = delete
+```
+
+
+
+
+
+---
+
+## Assignment operators
+
+### operator= noexcept
+
+
+
+
+
+```cpp showLineNumbers={false}
+timed_event & cuda::timed_event::operator=(
+ timed_event &&
+) noexcept = default
+```
+
+
+
+
+
+
+```cpp showLineNumbers={false}
+timed_event & cuda::timed_event::operator=(
+ const timed_event &
+) = delete
+```
+
+
+
+
+
+---
+
+## Methods
+
+### release inline noexcept nodiscard
+
+Retrieve the native `cudaEvent_t` handle and give up ownership.
+
+
+```cpp showLineNumbers={false}
+::cudaEvent_t cuda::timed_event::release() noexcept
+```
+
+
+
+The event object is in a moved-from state.
+
+
+**Returns:** cudaEvent_t The native handle being held by the [`event`](/libcudacxx/api/cuda::event) object.
+
+### record inline const
+
+Records an event on the specified stream.
+
+
+```cpp showLineNumbers={false}
+void cuda::event_ref::record(
+ stream_ref __stream
+) const
+```
+
+
+**Throws:** `cuda_error` if the event record fails
+
+### sync inline const
+
+Synchronizes the event.
+
+
+```cpp showLineNumbers={false}
+void cuda::event_ref::sync() const
+```
+
+
+**Throws:** `cuda_error` if waiting for the event fails
+
+### is_done inline const nodiscard
+
+Checks if all the work in the stream prior to the record of the event has completed.
+
+If is_done returns true, calling [sync()](/libcudacxx/api/cuda::event_ref::sync()) on this event will return immediately
+
+
+```cpp showLineNumbers={false}
+bool cuda::event_ref::is_done() const
+```
+
+
+**Throws:** `cuda_error` if the event query fails
+
+### get inline const noexcept nodiscard
+
+Retrieve the native `cudaEvent_t` handle.
+
+
+```cpp showLineNumbers={false}
+::cudaEvent_t cuda::event_ref::get() const noexcept
+```
+
+
+**Returns:** cudaEvent_t The native handle being held by the [event_ref](/libcudacxx/api/cuda::event_ref) object.
+
+### operator bool inline constexpr explicit const noexcept nodiscard
+
+Checks if the [`event_ref`](/libcudacxx/api/cuda::event_ref) is valid.
+
+
+```cpp showLineNumbers={false}
+cuda::event_ref::operator bool() const noexcept
+```
+
+
+**Returns:** true if the [`event_ref`](/libcudacxx/api/cuda::event_ref) is valid, false otherwise.
+
+---
+
+## Static methods
+
+### from_native_handle inline static noexcept nodiscard
+
+
+
+
+Construct a `timed_event` object from a native `cudaEvent_t` handle.
+
+
+```cpp showLineNumbers={false}
+static timed_event cuda::timed_event::from_native_handle(
+ ::cudaEvent_t __evnt
+) noexcept
+```
+
+
+
+The constructed `timed_event` object takes ownership of the native handle.
+
+
+**Returns:** `timed_event` The constructed `timed_event` object
+
+**Parameters**
+
+
+The native handle
+
+
+
+
+
+The following overloads are deleted to prevent misuse:
+
+
+```cpp showLineNumbers={false}
+static timed_event cuda::timed_event::from_native_handle(int) = delete;
+static timed_event cuda::timed_event::from_native_handle(::cuda::std::nullptr_t) = delete;
+```
+
+
+
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `value_type` | `::cudaEvent_t` |
diff --git a/fern/cudapages/cuda/cuda/cuda/transform_input_output_iterator.mdx b/fern/cudapages/cuda/cuda/cuda/transform_input_output_iterator.mdx
new file mode 100644
index 0000000..5cfdf48
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/transform_input_output_iterator.mdx
@@ -0,0 +1,345 @@
+---
+title: "cuda::transform_input_output_iterator"
+description: "[`transform_input_output_iterator`](/libcudacxx/api/cuda::transform_input_output_iterator) is a special kind of iterator which applies transform functions when reading from or writing to dereferenced values."
+---
+
+`transform_input_output_iterator` is a special kind of iterator which applies transform functions when reading from or writing to dereferenced values.
+
+This iterator is useful for algorithms that operate on a type that needs to be serialized/deserialized from values in another iterator, avoiding the need to materialize intermediate results in memory. This also enables the transform functions to be fused with the operations that read and write to the `transform_input_output_iterator`.
+
+The following code snippet demonstrates how to create a `transform_input_output_iterator` which performs different transformations when reading from and writing to the iterator.
+
+```cpp showLineNumbers={false}
+#include <cuda/iterator>
+```
+
+## Example
+
+```cpp showLineNumbers={false}
+#include <cuda/iterator>
+#include <thrust/device_vector.h>
+
+ int main()
+ {
+ const size_t size = 4;
+ thrust::device_vector<float> v(size);
+
+ // Write 1.0f, 2.0f, 3.0f, 4.0f to vector
+ thrust::sequence(v.begin(), v.end(), 1);
+
+ // Iterator that negates read values and writes squared values
+ auto iter = cuda::make_transform_input_output_iterator(v.begin(),
+ ::cuda::std::negate{}, thrust::square{});
+
+ // Iterator negates values when reading
+ std::cout << iter[0] << " "; // -1.0f;
+ std::cout << iter[1] << " "; // -2.0f;
+ std::cout << iter[2] << " "; // -3.0f;
+ std::cout << iter[3] << "\n"; // -4.0f;
+
+ // Write 1.0f, 2.0f, 3.0f, 4.0f to iterator
+ thrust::sequence(iter, iter + size, 1);
+
+ // Values were squared before writing to vector
+ std::cout << v[0] << " "; // 1.0f;
+ std::cout << v[1] << " "; // 4.0f;
+ std::cout << v[2] << " "; // 9.0f;
+ std::cout << v[3] << "\n"; // 16.0f;
+
+ }
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### transform_input_output_iterator inline constexpr noexcept
+
+
+
+
+Default constructs a `transform_input_output_iterator` with a value initialized iterator and functors.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::transform_input_output_iterator<_InputFn, _OutputFn, _Iter>::transform_input_output_iterator() noexcept(::cuda::std::is_nothrow_default_constructible_v< _Iter2 > &&::cuda::std::is_nothrow_default_constructible_v< _InputFn2 > &&::cuda::std::is_nothrow_default_constructible_v< _OutputFn2 >)
+```
+
+
+
+
+
+Constructs a `transform_input_output_iterator` with base iterator, input functor and output functor.
+
+
+```cpp showLineNumbers={false}
+cuda::transform_input_output_iterator<_InputFn, _OutputFn, _Iter>::transform_input_output_iterator(
+ _Iter __iter,
+ _InputFn __input_func,
+ _OutputFn __output_func
+) noexcept(::cuda::std::is_nothrow_move_constructible_v< _Iter > &&::cuda::std::is_nothrow_move_constructible_v< _InputFn > &&::cuda::std::is_nothrow_move_constructible_v< _OutputFn >)
+```
+
+
+**Parameters**
+
+
+The iterator to transform
+
+
+
+The input functor to apply to the iterator when reading
+
+
+
+The output functor to apply to the iterator when writing
+
+
+
+
+
+---
+
+## Methods
+
+### base inline constexpr noexcept nodiscard
+
+
+
+
+Extracts the stored base iterator.
+
+
+```cpp showLineNumbers={false}
+_Iter cuda::transform_input_output_iterator<_InputFn, _OutputFn, _Iter>::base() && noexcept(::cuda::std::is_nothrow_move_constructible_v< _Iter >)
+```
+
+
+
+
+
+const
+
+Returns a const reference to the base iterator stored.
+
+
+```cpp showLineNumbers={false}
+const _Iter & cuda::transform_input_output_iterator<_InputFn, _OutputFn, _Iter>::base() const & noexcept
+```
+
+
+
+
+
+### operator* inline constexpr noexcept nodiscard
+
+
+
+
+Dereferences the `transform_input_output_iterator`.
+
+Returns a proxy that transforms values read from the stored iterator via the stored input functor and transforms assigned values via the output functor
+
+
+```cpp showLineNumbers={false}
+reference cuda::transform_input_output_iterator<_InputFn, _OutputFn, _Iter>::operator*() noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Iter >)
+```
+
+
+
+
+
+const
+
+Dereferences the `transform_input_output_iterator`.
+
+Returns a proxy that transforms values read from the stored iterator via the stored input functor and transforms assigned values via the output functor
+
+
+```cpp showLineNumbers={false}
+reference cuda::transform_input_output_iterator<_InputFn, _OutputFn, _Iter>::operator*() const noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Iter >)
+```
+
+
+
+
+
+### operator[] inline constexpr noexcept nodiscard
+
+
+
+
+Subscripts the `transform_input_output_iterator`.
+
+Returns a proxy that transforms values read from the stored iterator advanced by a given number of elements via the stored input functor and transforms assigned values via the output functor
+
+
+```cpp showLineNumbers={false}
+template
+reference cuda::transform_input_output_iterator<_InputFn, _OutputFn, _Iter>::operator[](
+ difference_type __n
+) noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Iter2 > &&noexcept(::cuda::std::declval< const _Iter2 & >()+__n))
+```
+
+
+**Parameters**
+
+
+The number of elements to advance
+
+
+
+
+
+const
+
+Subscripts the `transform_input_output_iterator`.
+
+Returns a proxy that transforms values read from the stored iterator advanced by a given number of elements via the stored input functor and transforms assigned values via the output functor
+
+
+```cpp showLineNumbers={false}
+template
+reference cuda::transform_input_output_iterator<_InputFn, _OutputFn, _Iter>::operator[](
+ difference_type __n
+) const noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Iter2 > &&noexcept(::cuda::std::declval< const _Iter2 & >()+__n))
+```
+
+
+**Parameters**
+
+
+The number of elements to advance
+
+
+
+
+
+### operator++ inline constexpr noexcept
+
+
+
+
+Increments the stored iterator.
+
+
+```cpp showLineNumbers={false}
+transform_input_output_iterator & cuda::transform_input_output_iterator<_InputFn, _OutputFn, _Iter>::operator++() noexcept(++::cuda::std::declval< _Iter & >())
+```
+
+
+
+
+
+Increments the stored iterator.
+
+
+```cpp showLineNumbers={false}
+transform_input_output_iterator cuda::transform_input_output_iterator<_InputFn, _OutputFn, _Iter>::operator++(
+ int
+) noexcept(noexcept(++::cuda::std::declval< _Iter & >()) &&::cuda::std::is_nothrow_copy_constructible_v< _Iter > &&::cuda::std::is_nothrow_copy_constructible_v< _InputFn > &&::cuda::std::is_nothrow_copy_constructible_v< _OutputFn >)
+```
+
+
+
+
+
+### operator-- inline constexpr noexcept
+
+
+
+
+Decrements the stored iterator.
+
+
+```cpp showLineNumbers={false}
+template
+transform_input_output_iterator & cuda::transform_input_output_iterator<_InputFn, _OutputFn, _Iter>::operator--() noexcept(--::cuda::std::declval< _Iter2 & >())
+```
+
+
+
+
+
+Decrements the stored iterator.
+
+
+```cpp showLineNumbers={false}
+template
+transform_input_output_iterator cuda::transform_input_output_iterator<_InputFn, _OutputFn, _Iter>::operator--(
+ int
+) noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Iter > &&noexcept(--::cuda::std::declval< _Iter2 & >()))
+```
+
+
+
+
+
+### operator+= inline constexpr noexcept
+
+Advances the `transform_input_output_iterator` by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+template
+transform_input_output_iterator & cuda::transform_input_output_iterator<_InputFn, _OutputFn, _Iter>::operator+=(
+ difference_type __n
+) noexcept(::cuda::std::declval< _Iter2 & >()+=__n)
+```
+
+
+**Parameters**
+
+
+The number of elements to advance
+
+
+### operator-= inline constexpr noexcept
+
+Decrements the `transform_input_output_iterator` by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+template
+transform_input_output_iterator & cuda::transform_input_output_iterator<_InputFn, _OutputFn, _Iter>::operator-=(
+ difference_type __n
+) noexcept(::cuda::std::declval< _Iter2 & >() -=__n)
+```
+
+
+**Parameters**
+
+
+The number of elements to decrement
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `iterator_concept` | `::cuda::std::conditional_t< ::cuda::std::__has_random_access_traversal< _Iter >, ::cuda::std::random_access_iterator_tag, ::cuda::std::conditional_t<::cuda::std::__has_bidirectional_traversal< _Iter >, ::cuda::std::bidirectional_iterator_tag, ::cuda::std::conditional_t<::cuda::std::__has_forward_traversal< _Iter >, ::cuda::std::forward_iterator_tag, ::cuda::std::output_iterator_tag > > >` |
+| `iterator_category` | `::cuda::std::output_iterator_tag` |
+| `difference_type` | `::cuda::std::iter_difference_t< _Iter >` |
+| `value_type` | `::cuda::std::invoke_result_t< _InputFn &, ::cuda::std::iter_reference_t< _Iter > >` |
+| `pointer` | `void` |
+| `reference` | `__transform_input_output_proxy< _InputFn, _OutputFn, _Iter >` |
diff --git a/fern/cudapages/cuda/cuda/cuda/transform_iterator.mdx b/fern/cudapages/cuda/cuda/cuda/transform_iterator.mdx
new file mode 100644
index 0000000..972a6bb
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/transform_iterator.mdx
@@ -0,0 +1,358 @@
+---
+title: "cuda::transform_iterator"
+description: "[`transform_iterator`](/libcudacxx/api/cuda::transform_iterator) is an iterator which represents a pointer into a range of values after transformation by a functor."
+---
+
+`transform_iterator` is an iterator which represents a pointer into a range of values after transformation by a functor.
+
+This iterator is useful for creating a range filled with the result of applying an operation to another range without either explicitly storing it in memory, or explicitly executing the transformation. Using `transform_iterator` facilitates kernel fusion by deferring the execution of a transformation until the value is needed while saving both memory capacity and bandwidth.
+
+The following code snippet demonstrates how to create a `transform_iterator` which represents the result of `sqrtf` applied to the contents of a `thrust::device_vector`.
+
+This next example demonstrates how to use a `transform_iterator` with the `thrust::reduce` algorithm to compute the sum of squares of a sequence. We will create temporary `transform_iterators`, utilising class template argument deduction to avoid explicitly specifying their type:
+
+```cpp showLineNumbers={false}
+#include
+```
+
+## Example
+
+```cpp showLineNumbers={false}
+#include
+#include
+
+struct square_root
+{
+ __host__ __device__
+ float operator()(float x) const
+ {
+ return sqrtf(x);
+ }
+};
+
+int main()
+{
+ thrust::device_vector<float> v{1.0f, 4.0f, 9.0f, 16.0f};
+
+ using FloatIterator = thrust::device_vector<float>::iterator;
+
+ cuda::transform_iterator iter(v.begin(), square_root{});
+
+ *iter; // returns 1.0f
+ iter[0]; // returns 1.0f;
+ iter[1]; // returns 2.0f;
+ iter[2]; // returns 3.0f;
+ iter[3]; // returns 4.0f;
+
+ // iter[4] is an out-of-bounds error
+}
+```
+
+```cpp showLineNumbers={false}
+#include
+#include
+#include
+#include
+
+struct square
+{
+ __host__ __device__
+ float operator()(float x) const
+ {
+ return x * x;
+ }
+};
+
+int main()
+{
+ // initialize a device array
+ thrust::device_vector<float> v(4);
+ v[0] = 1.0f;
+ v[1] = 2.0f;
+ v[2] = 3.0f;
+ v[3] = 4.0f;
+ float sum_of_squares = thrust::reduce(cuda::transform_iterator{v.begin(), square{}},
+ cuda::transform_iterator{v.end(), square{}});
+
+ std::cout << "sum of squares: " << sum_of_squares << std::endl;
+ return 0;
+}
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
+**Inherits from:** `cuda::__transform_iterator_category_base< _Fn, _Iter >` (public)
+
+---
+
+## Constructors
+
+### transform_iterator inline constexpr noexcept
+
+
+
+
+Default constructs a `transform_iterator` with a value initialized iterator and functor.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::transform_iterator<_Fn, _Iter>::transform_iterator() noexcept(::cuda::std::is_nothrow_default_constructible_v< _Iter2 > &&::cuda::std::is_nothrow_default_constructible_v< _Fn2 >)
+```
+
+
+
+
+
+Constructs a `transform_iterator` with a given iterator and functor.
+
+
+```cpp showLineNumbers={false}
+cuda::transform_iterator<_Fn, _Iter>::transform_iterator(
+ _Iter __iter,
+ _Fn __func
+) noexcept(::cuda::std::is_nothrow_move_constructible_v< _Iter > &&::cuda::std::is_nothrow_move_constructible_v< _Fn >)
+```
+
+
+**Parameters**
+
+
+The iterator to transform
+
+
+
+The functor to apply to the iterator
+
+
+
+
+
+---
+
+## Methods
+
+### base inline constexpr noexcept nodiscard
+
+
+
+
+Extracts the stored iterator.
+
+
+```cpp showLineNumbers={false}
+_Iter cuda::transform_iterator<_Fn, _Iter>::base() && noexcept(::cuda::std::is_nothrow_move_constructible_v< _Iter >)
+```
+
+
+
+
+
+const
+
+Returns a const reference to the stored iterator.
+
+
+```cpp showLineNumbers={false}
+const _Iter & cuda::transform_iterator<_Fn, _Iter>::base() const & noexcept
+```
+
+
+
+
+
+### operator* inline constexpr noexcept nodiscard
+
+
+
+
+Dereferences the stored iterator and applies the stored functor to the result.
+
+
+```cpp showLineNumbers={false}
+reference cuda::transform_iterator<_Fn, _Iter>::operator*() noexcept(::cuda::std::invoke(::cuda::std::declval< _Fn & >(), *::cuda::std::declval< _Iter & >()))
+```
+
+
+
+
+
+const
+
+Dereferences the stored iterator and applies the stored functor to the result.
+
+
+```cpp showLineNumbers={false}
+reference cuda::transform_iterator<_Fn, _Iter>::operator*() const noexcept(::cuda::std::invoke(::cuda::std::declval< const _Fn & >(), *::cuda::std::declval< const _Iter2 & >()))
+```
+
+
+
+
+
+### operator[] inline constexpr noexcept nodiscard
+
+
+
+
+Subscripts the stored iterator by a number of elements and applies the stored functor to the result.
+
+
+```cpp showLineNumbers={false}
+reference cuda::transform_iterator<_Fn, _Iter>::operator[](
+ difference_type __n
+) noexcept(__transform_iterator_nothrow_subscript< _Fn, _Iter2 >)
+```
+
+
+**Parameters**
+
+
+The number of elements to advance by
+
+
+
+
+
+const
+
+Subscripts the stored iterator by a number of elements and applies the stored functor to the result.
+
+
+```cpp showLineNumbers={false}
+reference cuda::transform_iterator<_Fn, _Iter>::operator[](
+ difference_type __n
+) const noexcept(__transform_iterator_nothrow_subscript< const _Fn, _Iter2 >)
+```
+
+
+**Parameters**
+
+
+The number of elements to advance by
+
+
+
+
+
+### operator++ inline constexpr noexcept
+
+
+
+
+Increments the stored iterator.
+
+
+```cpp showLineNumbers={false}
+transform_iterator & cuda::transform_iterator<_Fn, _Iter>::operator++() noexcept(++::cuda::std::declval< _Iter & >())
+```
+
+
+
+
+
+Increments the stored iterator.
+
+
+```cpp showLineNumbers={false}
+auto cuda::transform_iterator<_Fn, _Iter>::operator++(
+ int
+) noexcept(++::cuda::std::declval< _Iter & >())
+```
+
+
+
+
+
+### operator-- inline constexpr noexcept
+
+
+
+
+Decrements the stored iterator.
+
+
+```cpp showLineNumbers={false}
+template
+transform_iterator & cuda::transform_iterator<_Fn, _Iter>::operator--() noexcept(--::cuda::std::declval< _Iter2 & >())
+```
+
+
+
+
+
+Decrements the stored iterator.
+
+
+```cpp showLineNumbers={false}
+template
+transform_iterator cuda::transform_iterator<_Fn, _Iter>::operator--(
+ int
+) noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Iter > &&noexcept(--::cuda::std::declval< _Iter2 & >()))
+```
+
+
+
+
+
+### operator+= inline constexpr noexcept
+
+Increments the `transform_iterator` by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+template
+transform_iterator & cuda::transform_iterator<_Fn, _Iter>::operator+=(
+ difference_type __n
+) noexcept(::cuda::std::declval< _Iter2 & >()+=__n)
+```
+
+
+**Parameters**
+
+
+The number of elements to increment
+
+
+### operator-= inline constexpr noexcept
+
+Decrements the `transform_iterator` by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+template
+transform_iterator & cuda::transform_iterator<_Fn, _Iter>::operator-=(
+ difference_type __n
+) noexcept(::cuda::std::declval< _Iter2 & >() -=__n)
+```
+
+
+**Parameters**
+
+
+The number of elements to decrement
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `iterator_concept` | `::cuda::std::conditional_t< ::cuda::std::__has_random_access_traversal< _Iter >, ::cuda::std::random_access_iterator_tag, ::cuda::std::conditional_t<::cuda::std::__has_bidirectional_traversal< _Iter >, ::cuda::std::bidirectional_iterator_tag, ::cuda::std::conditional_t<::cuda::std::__has_forward_traversal< _Iter >, ::cuda::std::forward_iterator_tag, ::cuda::std::input_iterator_tag > > >` |
+| `value_type` | `::cuda::std::remove_cvref_t<::cuda::std::invoke_result_t< _Fn &, ::cuda::std::iter_reference_t< _Iter > > >` |
+| `difference_type` | `::cuda::std::iter_difference_t< _Iter >` |
+| `reference` | `::cuda::std::invoke_result_t< _Fn &, ::cuda::std::iter_reference_t< _Iter > >` |
+| `pointer` | `void` |
diff --git a/fern/cudapages/cuda/cuda/cuda/transform_output_iterator.mdx b/fern/cudapages/cuda/cuda/cuda/transform_output_iterator.mdx
new file mode 100644
index 0000000..5c7b846
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/transform_output_iterator.mdx
@@ -0,0 +1,331 @@
+---
+title: "cuda::transform_output_iterator"
+description: "[`transform_output_iterator`](/libcudacxx/api/cuda::transform_output_iterator) is a special kind of output iterator which transforms a value written upon dereference."
+---
+
+`transform_output_iterator` is a special kind of output iterator which transforms a value written upon dereference.
+
+This iterator is useful for transforming the output of algorithms without explicitly storing the intermediate result in memory and applying a subsequent transformation, thereby avoiding wasted memory capacity and bandwidth. Using `transform_output_iterator` facilitates kernel fusion by deferring the execution of the transformation until the value is written, while saving both memory capacity and bandwidth.
+
+The following code snippet demonstrates how to create a `transform_output_iterator` which applies `sqrtf` to the assigned value.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+## Example
+
+```cpp showLineNumbers={false}
+#include
+#include
+
+struct square_root
+{
+ __host__ __device__
+ float operator()(float x) const
+ {
+ return cuda::std::sqrtf(x);
+ }
+};
+
+int main()
+{
+ thrust::device_vector<float> v(4);
+ cuda::transform_output_iterator iter(v.begin(), square_root());
+
+ iter[0] = 1.0f; // stores sqrtf( 1.0f)
+ iter[1] = 4.0f; // stores sqrtf( 4.0f)
+ iter[2] = 9.0f; // stores sqrtf( 9.0f)
+ iter[3] = 16.0f; // stores sqrtf(16.0f)
+ // iter[4] is an out-of-bounds error
+
+ v[0]; // returns 1.0f;
+ v[1]; // returns 2.0f;
+ v[2]; // returns 3.0f;
+ v[3]; // returns 4.0f;
+
+}
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+
+## Constructors
+
+### transform_output_iterator inline constexpr noexcept
+
+
+
+
+Default constructs a `transform_output_iterator` with a value initialized iterator and functor.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::transform_output_iterator<_Fn, _Iter>::transform_output_iterator() noexcept(::cuda::std::is_nothrow_default_constructible_v< _Iter2 > &&::cuda::std::is_nothrow_default_constructible_v< _Fn2 >)
+```
+
+
+
+
+
+Constructs a `transform_output_iterator` with a given iterator and output functor.
+
+
+```cpp showLineNumbers={false}
+cuda::transform_output_iterator<_Fn, _Iter>::transform_output_iterator(
+ _Iter __iter,
+ _Fn __func
+) noexcept(::cuda::std::is_nothrow_move_constructible_v< _Iter > &&::cuda::std::is_nothrow_move_constructible_v< _Fn >)
+```
+
+
+**Parameters**
+
+
+The iterator to transform
+
+
+
+The output function to apply to the iterator on assignment
+
+
+
+
+
+---
+
+## Methods
+
+### base inline constexpr noexcept nodiscard
+
+
+
+
+Extracts the stored iterator.
+
+
+```cpp showLineNumbers={false}
+_Iter cuda::transform_output_iterator<_Fn, _Iter>::base() && noexcept(::cuda::std::is_nothrow_move_constructible_v< _Iter >)
+```
+
+
+
+
+
+const
+
+Returns a const reference to the stored iterator.
+
+
+```cpp showLineNumbers={false}
+const _Iter & cuda::transform_output_iterator<_Fn, _Iter>::base() const & noexcept
+```
+
+
+
+
+
+### operator* inline constexpr noexcept nodiscard
+
+
+
+
+Returns a proxy that transforms the input upon assignment.
+
+
+```cpp showLineNumbers={false}
+auto cuda::transform_output_iterator<_Fn, _Iter>::operator*() noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Iter >)
+```
+
+
+
+
+
+const
+
+Returns a proxy that transforms the input upon assignment.
+
+
+```cpp showLineNumbers={false}
+auto cuda::transform_output_iterator<_Fn, _Iter>::operator*() const noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Iter >)
+```
+
+
+
+
+
+### operator[] inline constexpr noexcept nodiscard
+
+
+
+
+Subscripts the `transform_output_iterator`.
+
+
+```cpp showLineNumbers={false}
+template
+auto cuda::transform_output_iterator<_Fn, _Iter>::operator[](
+ difference_type __n
+) noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Iter2 > &&noexcept(::cuda::std::declval< _Iter2 & >()+__n))
+```
+
+
+**Returns:** A proxy that transforms the input upon assignment, storing the current iterator advanced by a given number of elements.
+
+**Parameters**
+
+
+The number of elements to advance by
+
+
+
+
+
+const
+
+Subscripts the `transform_output_iterator`.
+
+
+```cpp showLineNumbers={false}
+template
+auto cuda::transform_output_iterator<_Fn, _Iter>::operator[](
+ difference_type __n
+) const noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Iter2 > &&noexcept(::cuda::std::declval< const _Iter2 & >()+__n))
+```
+
+
+**Returns:** A proxy that transforms the input upon assignment, storing the current iterator advanced by a given number of elements.
+
+**Parameters**
+
+
+The number of elements to advance by
+
+
+
+
+
+### operator++ inline constexpr noexcept
+
+
+
+
+Increments the stored iterator.
+
+
+```cpp showLineNumbers={false}
+transform_output_iterator & cuda::transform_output_iterator<_Fn, _Iter>::operator++() noexcept(++::cuda::std::declval< _Iter & >())
+```
+
+
+
+
+
+Increments the stored iterator.
+
+
+```cpp showLineNumbers={false}
+auto cuda::transform_output_iterator<_Fn, _Iter>::operator++(
+ int
+) noexcept(++::cuda::std::declval< _Iter & >())
+```
+
+
+
+
+
+### operator-- inline constexpr noexcept
+
+
+
+
+Decrements the stored iterator.
+
+
+```cpp showLineNumbers={false}
+template
+transform_output_iterator & cuda::transform_output_iterator<_Fn, _Iter>::operator--() noexcept(--::cuda::std::declval< _Iter2 & >())
+```
+
+
+
+
+
+Decrements the stored iterator.
+
+
+```cpp showLineNumbers={false}
+template
+transform_output_iterator cuda::transform_output_iterator<_Fn, _Iter>::operator--(
+ int
+) noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Iter > &&noexcept(--::cuda::std::declval< _Iter2 & >()))
+```
+
+
+
+
+
+### operator+= inline constexpr noexcept
+
+Increments the `transform_output_iterator` by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+template
+transform_output_iterator & cuda::transform_output_iterator<_Fn, _Iter>::operator+=(
+ difference_type __n
+) noexcept(::cuda::std::declval< _Iter2 & >()+=__n)
+```
+
+
+**Parameters**
+
+
+The number of elements to increment
+
+
+### operator-= inline constexpr noexcept
+
+Decrements the `transform_output_iterator` by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+template
+transform_output_iterator & cuda::transform_output_iterator<_Fn, _Iter>::operator-=(
+ difference_type __n
+) noexcept(::cuda::std::declval< _Iter2 & >() -=__n)
+```
+
+
+**Parameters**
+
+
+The number of elements to decrement
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `iterator_concept` | `::cuda::std::conditional_t< ::cuda::std::__has_random_access_traversal< _Iter >, ::cuda::std::random_access_iterator_tag, ::cuda::std::conditional_t<::cuda::std::__has_bidirectional_traversal< _Iter >, ::cuda::std::bidirectional_iterator_tag, ::cuda::std::conditional_t<::cuda::std::__has_forward_traversal< _Iter >, ::cuda::std::forward_iterator_tag, ::cuda::std::output_iterator_tag > > >` |
+| `iterator_category` | `::cuda::std::output_iterator_tag` |
+| `difference_type` | `::cuda::std::iter_difference_t< _Iter >` |
+| `value_type` | `void` |
+| `pointer` | `void` |
+| `reference` | `void` |
diff --git a/fern/cudapages/cuda/cuda/cuda/zip_function.mdx b/fern/cudapages/cuda/cuda/cuda/zip_function.mdx
new file mode 100644
index 0000000..ab5139d
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/zip_function.mdx
@@ -0,0 +1,67 @@
+---
+title: "cuda::zip_function"
+description: "Adaptor that transforms a functor taking arguments of types `Ts`... into one accepting a `tuple`."
+---
+
+Adaptor that transforms a functor taking arguments of types `Ts`... into one accepting a `tuple`.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+
+
+
+
+The functor to wrap
+
+
+
+
+
+---
+
+## Constructors
+
+### zip_function inline constexpr noexcept
+
+
+
+
+Default-constructs a `zip_function`.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::zip_function<_Fn>::zip_function() noexcept(::cuda::std::is_nothrow_default_constructible_v< _Fn2 >)
+```
+
+
+
+
+
+Constructs a `zip_function` by copying the given functor.
+
+
+```cpp showLineNumbers={false}
+cuda::zip_function<_Fn>::zip_function(
+ const _Fn &__fun
+) noexcept(::cuda::std::is_nothrow_copy_constructible_v< _Fn >)
+```
+
+
+
+
+
+Constructs a `zip_function` by moving from the given functor.
+
+
+```cpp showLineNumbers={false}
+cuda::zip_function<_Fn>::zip_function(
+ _Fn &&__fun
+) noexcept(::cuda::std::is_nothrow_move_constructible_v< _Fn >)
+```
+
+
+
+
diff --git a/fern/cudapages/cuda/cuda/cuda/zip_iterator.mdx b/fern/cudapages/cuda/cuda/cuda/zip_iterator.mdx
new file mode 100644
index 0000000..8d0cbf0
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/zip_iterator.mdx
@@ -0,0 +1,332 @@
+---
+title: "cuda::zip_iterator"
+description: "[`zip_iterator`](/libcudacxx/api/cuda::zip_iterator) is an iterator which represents a `tuple` of iterators."
+---
+
+`zip_iterator` is an iterator which represents a `tuple` of iterators.
+
+This iterator is useful for creating a virtual array of structures while achieving the same performance and bandwidth as the structure of arrays idiom. `zip_iterator` also facilitates kernel fusion by providing a convenient means of amortizing the execution of the same operation over multiple ranges.
+
+The following code snippet demonstrates how to create a `zip_iterator` which represents the result of "zipping" multiple ranges together.
+
+This example shows how to use `zip_iterator` to copy multiple ranges with a single call to `thrust::copy`.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+## Example
+
+```cpp showLineNumbers={false}
+#include
+#include
+
+thrust::device_vector<int> int_v{0, 1, 2};
+thrust::device_vector<float> float_v{0.0f, 1.0f, 2.0f};
+thrust::device_vector<char> char_v{'a', 'b', 'c'};
+
+cuda::zip_iterator iter{int_v.begin(), float_v.begin(), char_v.begin()};
+
+*iter; // returns (0, 0.0f, 'a')
+iter[0]; // returns (0, 0.0f, 'a')
+iter[1]; // returns (1, 1.0f, 'b')
+iter[2]; // returns (2, 2.0f, 'c')
+
+cuda::std::get<0>(iter[2]); // returns 2
+cuda::std::get<1>(iter[0]); // returns 0.0f
+cuda::std::get<2>(iter[1]); // returns 'b'
+
+// iter[3] is an out-of-bounds error
+```
+
+```cpp showLineNumbers={false}
+#include
+#include
+
+int main()
+{
+ thrust::device_vector<int> int_in{0, 1, 2}, int_out(3);
+ thrust::device_vector<float> float_in{0.0f, 10.0f, 20.0f}, float_out(3);
+
+ thrust::copy(cuda::zip_iterator{int_in.begin(), float_in.begin()},
+ cuda::zip_iterator{int_in.end(), float_in.end()},
+ cuda::zip_iterator{int_out.begin(),float_out.begin()});
+
+ // int_out is now [0, 1, 2]
+ // float_out is now [0.0f, 10.0f, 20.0f]
+
+ return 0;
+}
+```
+
+
+
+
+
+
+
+
+
+
+**Inherits from:** `__zv_iter_category_base< _Iterators... >` (public)
+
+---
+
+## Constructors
+
+### zip_iterator
+
+
+
+
+Default-constructs a `zip_iterator` by defaulting all stored iterators.
+
+
+```cpp showLineNumbers={false}
+cuda::zip_iterator<_Iterators>::zip_iterator() = default
+```
+
+
+
+
+
+inline constexpr explicit
+
+Constructs a `zip_iterator` from a tuple of iterators.
+
+
+```cpp showLineNumbers={false}
+cuda::zip_iterator<_Iterators>::zip_iterator(
+ ::cuda::std::tuple<_Iterators...> __iters
+)
+```
+
+
+**Parameters**
+
+
+A tuple of iterators
+
+
+
+
+
+inline constexpr explicit
+
+Constructs a `zip_iterator` from a tuple of iterators.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::zip_iterator<_Iterators>::zip_iterator(
+ ::cuda::std::tuple<_Iterators...> __iters
+)
+```
+
+
+**Parameters**
+
+
+A tuple of iterators
+
+
+
+
+
+inline constexpr explicit
+
+Constructs a `zip_iterator` from a variadic set of iterators.
+
+
+```cpp showLineNumbers={false}
+cuda::zip_iterator<_Iterators>::zip_iterator(
+ _Iterators... __iters
+)
+```
+
+
+**Parameters**
+
+
+The input iterators
+
+
+
+
+
+inline constexpr
+
+Converts a different `zip_iterator`.
+
+
+```cpp showLineNumbers={false}
+template
+cuda::zip_iterator<_Iterators>::zip_iterator(
+ zip_iterator<_OtherIters...> __iter
+)
+```
+
+
+**Parameters**
+
+
+The other `zip_iterator`
+
+
+
+
+
+---
+
+## Methods
+
+### operator* inline constexpr const noexcept nodiscard
+
+Dereferences the `zip_iterator`.
+
+
+```cpp showLineNumbers={false}
+auto cuda::zip_iterator<_Iterators>::operator*() const noexcept(::cuda::std::apply(__zip_op_star{}, __current_))
+```
+
+
+**Returns:** A tuple of references obtained by dereferencing every stored iterator
+
+### operator[] inline constexpr const noexcept
+
+Subscripts the `zip_iterator` with an offset.
+
+
+```cpp showLineNumbers={false}
+template
+auto cuda::zip_iterator<_Iterators>::operator[](
+ difference_type __n
+) const noexcept(::cuda::std::apply(__zip_op_index{__n}, __current_))
+```
+
+
+**Returns:** A tuple of references obtained by subscripting every stored iterator
+
+**Parameters**
+
+
+The additional offset
+
+
+### operator++ inline constexpr
+
+
+
+
+noexcept
+
+Increments all stored iterators.
+
+
+```cpp showLineNumbers={false}
+zip_iterator & cuda::zip_iterator<_Iterators>::operator++() noexcept(::cuda::std::apply(__zip_op_increment{}, __current_))
+```
+
+
+
+
+
+Increments all stored iterators.
+
+
+```cpp showLineNumbers={false}
+auto cuda::zip_iterator<_Iterators>::operator++(
+ int
+)
+```
+
+
+**Returns:** A copy of the original `zip_iterator` if possible
+
+
+
+
+### operator-- inline constexpr
+
+
+
+
+noexcept
+
+Decrements all stored iterators.
+
+
+```cpp showLineNumbers={false}
+template
+zip_iterator & cuda::zip_iterator<_Iterators>::operator--() noexcept(::cuda::std::apply(__zip_op_decrement{}, __current_))
+```
+
+
+
+
+
+Decrements all stored iterators.
+
+
+```cpp showLineNumbers={false}
+template
+zip_iterator cuda::zip_iterator<_Iterators>::operator--(
+ int
+)
+```
+
+
+
+
+
+### operator+= inline constexpr noexcept
+
+Increments all stored iterators by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+template
+zip_iterator & cuda::zip_iterator<_Iterators>::operator+=(
+ difference_type __n
+) noexcept(::cuda::std::apply(__zip_op_pe{__n}, __current_))
+```
+
+
+**Parameters**
+
+
+The number of elements to increment
+
+
+### operator-= inline constexpr noexcept
+
+Decrements all stored iterators by a given number of elements.
+
+
+```cpp showLineNumbers={false}
+template
+zip_iterator & cuda::zip_iterator<_Iterators>::operator-=(
+ difference_type __n
+) noexcept(::cuda::std::apply(__zip_op_me{__n}, __current_))
+```
+
+
+**Parameters**
+
+
+The number of elements to decrement
+
+
+---
+
+## Types
+
+### Typedefs
+
+| Name | Definition |
+|---|---|
+| `iterator_concept` | `decltype(__get_zip_iterator_concept< _Iterators... >())` |
+| `value_type` | `::cuda::std::tuple< __zip_maybe_proxy_value_type_t< _Iterators >... >` |
+| `reference` | `::cuda::std::tuple< __zip_maybe_proxy_reference_t< _Iterators >... >` |
+| `difference_type` | `::cuda::std::common_type_t<::cuda::std::iter_difference_t< _Iterators >... >` |
+| `pointer` | `void` |
diff --git a/fern/cudapages/cuda/cuda/cuda/zip_transform_iterator.mdx b/fern/cudapages/cuda/cuda/cuda/zip_transform_iterator.mdx
new file mode 100644
index 0000000..0345da2
--- /dev/null
+++ b/fern/cudapages/cuda/cuda/cuda/zip_transform_iterator.mdx
@@ -0,0 +1,307 @@
+---
+title: "cuda::zip_transform_iterator"
+description: "[`zip_transform_iterator`](/libcudacxx/api/cuda::zip_transform_iterator) is an iterator which represents the result of a transformation of a set of sequences with a given function."
+---
+
+`zip_transform_iterator` is an iterator which represents the result of a transformation of a set of sequences with a given function.
+
+This iterator is useful for creating a range filled with the result of applying an operation to another range without either explicitly storing it in memory, or explicitly executing the transformation. Using `zip_transform_iterator` facilitates kernel fusion by deferring the execution of a transformation until the value is needed while saving both memory capacity and bandwidth.
+
+`zip_transform_iterator` is morally equivalent to a combination of [transform_iterator](/libcudacxx/api/cuda::transform_iterator) and [zip_iterator](/libcudacxx/api/cuda::zip_iterator)
+
+`zip_transform_iterator` has the additional benefit that it does not require an artificial [`zip_function`](/libcudacxx/api/cuda::zip_function) to work and more importantly does not need to materialize the result of dereferencing the stored iterators when passing them to the stored function.
+
+The following code snippet demonstrates how to create a `zip_transform_iterator` which represents the result of "zipping" multiple ranges together.
+
+This example shows how to use `zip_transform_iterator` to copy multiple ranges with a single call to `thrust::copy`.
+
+```cpp showLineNumbers={false}
+#include
+```
+
+## Example
+
+```cpp showLineNumbers={false}
+template
+using zip_transform_iterator = cuda::transform_iterator, cuda::zip_function>;
+```
+
+```cpp showLineNumbers={false}
+#include
+#include
+
+struct SumArgs {
+ __host__ __device__ float operator()(float a, float b, float c) const noexcept {
+ return a + b + c;
+ }
+};
+
+thrust::device_vector<float> A{0.f, 1.f, 2.f};
+thrust::device_vector<float> B{1.f, 2.f, 3.f};
+thrust::device_vector<float> C{2.f, 3.f, 4.f};
+
+cuda::zip_transform_iterator iter{SumArgs{}, A.begin(), B.begin(), C.begin()};
+
+*iter; // returns (3.f)
+iter[0]; // returns (3.f)
+iter[1]; // returns (6.f)
+iter[2]; // returns (9.f)
+// iter[3] is an out-of-bounds error
+```
+
+```cpp showLineNumbers={false}
+#include
+#include
+
+int main()
+{
+ struct SumArgs {
+ __host__ __device__ float operator()(float a, float b, float c) const noexcept {
+ return a + b + c;
+ }
+ };
+
+ thrust::device_vector