forked from modular/modular
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreduction.mojo
More file actions
159 lines (137 loc) · 4.84 KB
/
reduction.mojo
File metadata and controls
159 lines (137 loc) · 4.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# ===----------------------------------------------------------------------=== #
# Copyright (c) 2025, Modular Inc. All rights reserved.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions:
# https://llvm.org/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===----------------------------------------------------------------------=== #
from benchmark import (
Bench,
BenchConfig,
Bencher,
BenchId,
ThroughputMeasure,
BenchMetric,
)
from bit import log2_floor
from math import ceildiv
from memory import stack_allocation
from os.atomic import Atomic
from random import randint
from sys import has_accelerator, sizeof
from testing import assert_equal
from gpu import thread_idx, block_idx, block_dim, grid_dim, warp, barrier
from gpu.host import DeviceContext
from gpu.memory import AddressSpace, load
# Initialize parameters
# To achieve high bandwidth increase SIZE to large value
alias TPB = 512  # threads per block; must be a power of two (LOG_TPB relies on it)
alias LOG_TPB = log2_floor(TPB)  # = 9 for TPB = 512; drives the tree-reduction depth
alias BATCH_SIZE = 8  # needs to be power of 2; elements loaded per thread as one SIMD vector
alias SIZE = 1 << 29  # number of Int32 elements to reduce (2^29; divisible by TPB * BATCH_SIZE)
alias NUM_BLOCKS = ceildiv(SIZE, TPB * BATCH_SIZE)  # one grid pass covers SIZE elements
alias WARP_SIZE = 32  # lanes that warp.sum reduces in the final stage
alias dtype = DType.int32
fn sum_kernel[
    size: Int, batch_size: Int
](output: UnsafePointer[Int32], a: UnsafePointer[Int32],):
    """Efficient reduction of the vector a.

    Three stages per block:
      1. Each thread loads `batch_size` consecutive elements as one SIMD
         vector and folds them into a private running sum.
      2. A shared-memory tree reduction halves the active threads each step
         until one warp's worth of partial sums remains.
      3. `warp.sum` reduces the final warp, and lane 0 atomically adds the
         block's total into `output[0]`.

    The kernel accumulates into `output`; callers must zero it beforehand.
    """
    # Shared-memory scratch: one partial sum per thread in this block.
    sums = stack_allocation[
        TPB,
        Scalar[dtype],
        address_space = AddressSpace.SHARED,
    ]()
    global_tid = block_idx.x * block_dim.x + thread_idx.x
    tid = thread_idx.x
    threads_in_grid = TPB * grid_dim.x
    var sum: Int32 = 0
    # Grid-stride loop over vector indices; each index covers batch_size
    # elements. NOTE(review): the guard checks idx < size but the load reads
    # idx..idx+batch_size-1, so it assumes size is a multiple of batch_size
    # (true for the aliases in this file) — confirm before generalizing.
    for i in range(global_tid, size, threads_in_grid):
        idx = i * batch_size
        # Load in a vectorized fashion and reduce the loaded SIMD vector
        if idx < size:
            sum += load[width=batch_size](a, idx).reduce_add()
    sums[tid] = sum
    barrier()
    # Reduce until the first warp
    active_threads = TPB
    # Unrolled at compile time: range(1, LOG_TPB - 4) performs LOG_TPB - 5
    # halvings, leaving TPB >> (LOG_TPB - 5) = 32 = WARP_SIZE active lanes
    # (the constant 4 encodes log2(WARP_SIZE) - 1; valid while WARP_SIZE == 32).
    @parameter
    for power in range(1, LOG_TPB - 4):
        active_threads >>= 1
        if tid < active_threads:
            sums[tid] += sums[tid + active_threads]
        # All threads must sync before the next halving reads updated sums.
        barrier()
    # Reduce the warp and accumulate via atomic addition
    if tid < WARP_SIZE:
        var warp_sum: Int32 = sums[tid][0]
        warp_sum = warp.sum(warp_sum)
        if tid == 0:
            _ = Atomic.fetch_add(output, warp_sum)
# Benchmark harness: measures repeated launches of sum_kernel via Bencher.
@parameter
@always_inline
fn sum_kernel_benchmark(
    mut b: Bencher, input_data: (UnsafePointer[Int32], UnsafePointer[Int32])
) capturing raises:
    # One measured iteration: enqueue a single kernel launch on the given
    # device context. input_data carries (output pointer, input pointer).
    @parameter
    @always_inline
    fn launch_once(ctx: DeviceContext) raises:
        var result_ptr = input_data[0]
        var data_ptr = input_data[1]
        ctx.enqueue_function[sum_kernel[SIZE, BATCH_SIZE]](
            result_ptr,
            data_ptr,
            grid_dim=NUM_BLOCKS,
            block_dim=TPB,
        )

    var device = DeviceContext()
    b.iter_custom[launch_once](device)
def main():
    """Validates sum_kernel against a sequential host sum, then benchmarks it.

    Flow: allocate device buffers, fill the input with random ints in [0, 10],
    launch the kernel once and assert correctness, then time repeated launches
    with Bench and print a throughput table.
    """
    constrained[
        has_accelerator(),
        "This example requires a supported GPU",
    ]()
    with DeviceContext() as ctx:
        # Allocate memory on the device
        out = ctx.enqueue_create_buffer[dtype](1).enqueue_fill(0)
        a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
        # Initialise a with random integers between 0 and 10
        with a.map_to_host() as a_host:
            randint[dtype](a_host.unsafe_ptr(), SIZE, 0, 10)
        # Get unsafe pointers to device
        out_ptr = out.unsafe_ptr()
        a_ptr = a.unsafe_ptr()
        # Call the kernel
        ctx.enqueue_function[sum_kernel[SIZE, BATCH_SIZE]](
            out_ptr,
            a_ptr,
            grid_dim=NUM_BLOCKS,
            block_dim=TPB,
        )
        # Block until the launch completes so the result is ready to read.
        ctx.synchronize()
        # Calculate the sum in a sequential fashion on the host
        # for correctness check
        expected = ctx.enqueue_create_host_buffer[dtype](1).enqueue_fill(0)
        with a.map_to_host() as a_host:
            for i in range(SIZE):
                expected[0] += a_host[i]
        # Assert the correctness of the kernel
        with out.map_to_host() as out_host:
            print("out:", out_host)
            print("expected:", expected)
            assert_equal(out_host[0], expected[0])
        # Benchmark performance
        # NOTE(review): the benchmark reuses out_ptr without re-zeroing it, so
        # the kernel's atomic adds accumulate across iterations; harmless for
        # timing, but out no longer holds the correct sum afterwards.
        var bench = Bench(BenchConfig(max_iters=50000))
        bench.bench_with_input[
            (UnsafePointer[Int32], UnsafePointer[Int32]), sum_kernel_benchmark
        ](
            BenchId("sum_kernel_benchmark", "gpu"),
            (out_ptr, a_ptr),
            # Bytes moved per launch: SIZE Int32 reads.
            ThroughputMeasure(BenchMetric.bytes, SIZE * sizeof[dtype]()),
        )
        # Pretty print in table format
        print(bench)