# Patch overview: tile-indexed and tile-space atomic operations.
#
# NOTE(review): this text is a unified git diff over four files with a Julia payload;
# its original line breaks and indentation appear to have been flattened to single spaces.
#
# src/compiler/intrinsics/atomics.jl
#   * emit_atomic_rmw_tile!(ctx, args, mode): shared codegen for tile-indexed RMW atomics.
#     args are (ptr_tile, val, mask, memory_order, memory_scope). Throws IRError when
#     ptr_tile/val/mask do not emit a value or when memory_order/memory_scope are not
#     constants. Result shape and element type are taken from the value tile. AtomicADD is
#     switched to AtomicADDF when the element type is an AbstractFloat. The op is encoded
#     with encode_AtomicRMWPtrOp!, ctx.token is replaced by the returned token, and a CGVal
#     wrapping the old values is returned.
#   * Intrinsics atomic_xchg_tile, atomic_add_tile, atomic_cas_tile: each tfunc returns
#     CC.widenconst of the value (or expected) argument; each efunc sets
#     effect_free=CC.ALWAYS_FALSE. atomic_cas_tile has its own emit_intrinsic! that uses
#     encode_AtomicCASPtrOp! and takes shape/element type from the expected tile.
#
# src/language/atomics.jl
#   * _atomic_ptrs_mask(array, indices): subtracts one from each index tile, broadcasts all
#     index tiles to a common shape S, converts them to Tile{Int32}, forms the linear offset
#     sum(idx[d] * stride[d]), builds the pointer tile with Intrinsics.offset, and builds a
#     mask for 0 <= idx[d] < size(array, d). Returns (ptr_tile, mask, S).
#   * atomic_add / atomic_xchg: four methods each, generated with @eval from
#     _ATOMIC_RMW_OPS (N-tuple of index tiles or a single index tile for 1-D arrays;
#     scalar or tile value). atomic_cas: the same four shapes, written out by hand.
#     All default to memory_order=MemoryOrder.AcqRel and memory_scope=MemScope.Device.
#   * _tile_space_ptrs_mask(array, index, Val(Shape)): builds per-dimension 1-based element
#     index tiles from arange plus the base (index[d] - 1) * Shape[d], the linear offset,
#     and a mask for 1 <= idx[d] <= size(array, d). Returns (ptr_tile, mask).
#   * Tile-space atomic_add(array, index, tile): reshapes the tile with _reshape_to_rank,
#     then calls Intrinsics.atomic_add_tile; a scalar-index method forwards for 1-D arrays.
#
# test/codegen/operations.jl: filecheck tests looking for "iota", "offset" and
#   "atomic_cas_tko" / "atomic_rmw_tko" in the generated code.
# test/execution/atomics.jl: execution tests for tile-indexed add/xchg/cas (1-D, 3-D, tile
#   values, returned old values) and tile-space add (1-D, 2-D, 3-D, trailing singleton).
#
# NOTE(review): _atomic_ptrs_mask converts indices to Int32 before the bounds check;
#   presumably index values always fit in Int32 -- confirm against callers.
# NOTE(review): the "out-of-bounds" execution test allocates only 8 elements and asserts
#   that all of them equal 1, so it cannot observe a stray write past the array end.
# NOTE(review): the codegen "tile-indexed 3D atomic_add" test passes three (4,) index tiles
#   without reshaping, unlike the execution test, which reshapes to (4,1,1)/(1,4,1)/(1,1,4).
diff --git a/src/compiler/intrinsics/atomics.jl b/src/compiler/intrinsics/atomics.jl index 9c480bf..2a84539 100644 --- a/src/compiler/intrinsics/atomics.jl +++ b/src/compiler/intrinsics/atomics.jl @@ -177,3 +177,119 @@ efunc(::typeof(Intrinsics.atomic_add), effects::CC.Effects) = function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add), args) emit_atomic_rmw!(ctx, args, AtomicADD) end + +# ============================================================================ +# Tile-indexed atomic operations +# These take pre-computed pointer tiles, value tiles, and masks. +# Used by the public API for tile-indexed atomic operations. +# ============================================================================ + +# Shared codegen helper for tile-indexed atomic RMW operations +function emit_atomic_rmw_tile!(ctx::CGCtx, args::AbstractVector, mode::AtomicRMWMode) + cb = ctx.cb + tt = ctx.tt + + # args: (ptr_tile, val, mask, memory_order, memory_scope) + ptr_tv = emit_value!(ctx, args[1]) + ptr_tv === nothing && throw(IRError("tile-indexed atomic RMW requires ptr_tile")) + val_tv = emit_value!(ctx, args[2]) + val_tv === nothing && throw(IRError("tile-indexed atomic RMW requires value")) + mask_tv = emit_value!(ctx, args[3]) + mask_tv === nothing && throw(IRError("tile-indexed atomic RMW requires mask")) + + memory_order = @something get_constant(ctx, args[4]) throw(IRError("tile-indexed atomic RMW requires constant memory_order")) + memory_scope = @something get_constant(ctx, args[5]) throw(IRError("tile-indexed atomic RMW requires constant memory_scope")) + + shape = val_tv.shape + elem_type = eltype(val_tv.jltype) + + dtype = julia_to_tile_dtype!(tt, elem_type) + result_tile_type = tile_type!(tt, dtype, collect(shape)) + token_type = Token(tt) + + # Auto-promote integer ADD to float ADD for floating-point types + actual_mode = mode + if mode == AtomicADD && elem_type <: AbstractFloat + actual_mode = AtomicADDF + end + + mem_ordering = 
memory_order_to_semantics(memory_order) + mem_scope = memory_scope_to_scope(memory_scope) + + old_val, new_token = encode_AtomicRMWPtrOp!(cb, result_tile_type, token_type, + ptr_tv.v, val_tv.v, actual_mode; + mask=mask_tv.v, + token=ctx.token, + memory_ordering=mem_ordering, + memory_scope=mem_scope) + ctx.token = new_token + + CGVal(old_val, result_tile_type, Tile{elem_type, Tuple{shape...}}, collect(shape)) +end + +# Tile-indexed atomic exchange +@intrinsic atomic_xchg_tile(ptr_tile, val, mask, memory_order, memory_scope) +function tfunc(𝕃, ::typeof(Intrinsics.atomic_xchg_tile), @nospecialize(ptrs), @nospecialize(val), @nospecialize args...) + CC.widenconst(val) +end +efunc(::typeof(Intrinsics.atomic_xchg_tile), effects::CC.Effects) = + CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_xchg_tile), args) + emit_atomic_rmw_tile!(ctx, args, AtomicXCHG) +end + +# Tile-indexed atomic addition +@intrinsic atomic_add_tile(ptr_tile, val, mask, memory_order, memory_scope) +function tfunc(𝕃, ::typeof(Intrinsics.atomic_add_tile), @nospecialize(ptrs), @nospecialize(val), @nospecialize args...) + CC.widenconst(val) +end +efunc(::typeof(Intrinsics.atomic_add_tile), effects::CC.Effects) = + CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add_tile), args) + emit_atomic_rmw_tile!(ctx, args, AtomicADD) +end + +# Tile-indexed atomic compare-and-swap +@intrinsic atomic_cas_tile(ptr_tile, expected, desired, mask, memory_order, memory_scope) +function tfunc(𝕃, ::typeof(Intrinsics.atomic_cas_tile), @nospecialize(ptrs), @nospecialize(expected), @nospecialize args...) 
+ CC.widenconst(expected) +end +efunc(::typeof(Intrinsics.atomic_cas_tile), effects::CC.Effects) = + CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_cas_tile), args) + cb = ctx.cb + tt = ctx.tt + + # args: (ptr_tile, expected, desired, mask, memory_order, memory_scope) + ptr_tv = emit_value!(ctx, args[1]) + ptr_tv === nothing && throw(IRError("tile-indexed atomic CAS requires ptr_tile")) + expected_tv = emit_value!(ctx, args[2]) + expected_tv === nothing && throw(IRError("tile-indexed atomic CAS requires expected value")) + desired_tv = emit_value!(ctx, args[3]) + desired_tv === nothing && throw(IRError("tile-indexed atomic CAS requires desired value")) + mask_tv = emit_value!(ctx, args[4]) + mask_tv === nothing && throw(IRError("tile-indexed atomic CAS requires mask")) + + memory_order = @something get_constant(ctx, args[5]) throw(IRError("tile-indexed atomic CAS requires constant memory_order")) + memory_scope = @something get_constant(ctx, args[6]) throw(IRError("tile-indexed atomic CAS requires constant memory_scope")) + + shape = expected_tv.shape + elem_type = eltype(expected_tv.jltype) + + dtype = julia_to_tile_dtype!(tt, elem_type) + result_tile_type = tile_type!(tt, dtype, collect(shape)) + token_type = Token(tt) + + mem_ordering = memory_order_to_semantics(memory_order) + mem_scope = memory_scope_to_scope(memory_scope) + + old_val, new_token = encode_AtomicCASPtrOp!(cb, result_tile_type, token_type, + ptr_tv.v, expected_tv.v, desired_tv.v; + mask=mask_tv.v, + token=ctx.token, + memory_ordering=mem_ordering, + memory_scope=mem_scope) + ctx.token = new_token + + CGVal(old_val, result_tile_type, Tile{elem_type, Tuple{shape...}}, collect(shape)) +end diff --git a/src/language/atomics.jl b/src/language/atomics.jl index 5405449..7abc444 100644 --- a/src/language/atomics.jl +++ b/src/language/atomics.jl @@ -80,3 +80,188 @@ old_val = ct.atomic_add(counters, idx, Int32(1)) 
memory_scope::Int=MemScope.Device) where {T} Intrinsics.atomic_add(array, index - One(), val, memory_order, memory_scope) end + +# ============================================================================ +# Tile-indexed atomic operations +# These accept Tile indices to perform atomic operations on multiple elements. +# ============================================================================ + +# --- Pointer/mask helper (N-dimensional) --- + +@inline function _atomic_ptrs_mask(array::TileArray{T, N}, + indices::NTuple{N, Tile{<:Integer}}) where {T, N} + # Convert each index to 0-indexed + indices_0 = ntuple(Val(N)) do d + indices[d] .- one(eltype(indices[d])) + end + + # Broadcast all index tiles to a common shape + S = reduce(broadcast_shape, ntuple(d -> size(indices[d]), Val(N))) + + # Broadcast and convert to Int32 + indices_i32 = ntuple(Val(N)) do d + convert(Tile{Int32}, broadcast_to(indices_0[d], S)) + end + + # Linear index: sum(idx[d] * stride[d]) + linear_idx = reduce(.+, ntuple(Val(N)) do d + indices_i32[d] .* broadcast_to(Tile(array.strides[d]), S) + end) + + ptr_tile = Intrinsics.offset(array.ptr, linear_idx) + + # Bounds mask: 0 <= idx[d] < size[d] for all d + zero_bc = broadcast_to(Tile(Int32(0)), S) + mask = reduce(.&, ntuple(Val(N)) do d + (indices_i32[d] .>= zero_bc) .& (indices_i32[d] .< broadcast_to(Tile(size(array, d)), S)) + end) + + (ptr_tile, mask, S) +end + +# 1D convenience: single Tile -> 1-tuple +@inline function _atomic_ptrs_mask(array::TileArray{T, 1}, indices::Tile{<:Integer}) where {T} + _atomic_ptrs_mask(array, (indices,)) +end + +# --- RMW operations (atomic_add, atomic_xchg) --- + +const _ATOMIC_RMW_OPS = ( + (:add, :atomic_add_tile), + (:xchg, :atomic_xchg_tile), +) + +for (op, intrinsic) in _ATOMIC_RMW_OPS + fname = Symbol(:atomic_, op) + + # N-D with scalar value + @eval @inline function $fname(array::TileArray{T, N}, + indices::NTuple{N, Tile{<:Integer}}, val::T; + memory_order::Int=MemoryOrder.AcqRel, + 
memory_scope::Int=MemScope.Device) where {T, N} + ptr_tile, mask, S = _atomic_ptrs_mask(array, indices) + val_tile = broadcast_to(Tile(val), S) + Intrinsics.$intrinsic(ptr_tile, val_tile, mask, memory_order, memory_scope) + end + + # N-D with tile value + @eval @inline function $fname(array::TileArray{T, N}, + indices::NTuple{N, Tile{<:Integer}}, val::Tile{T}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, N} + ptr_tile, mask, S = _atomic_ptrs_mask(array, indices) + val_bc = broadcast_to(val, S) + Intrinsics.$intrinsic(ptr_tile, val_bc, mask, memory_order, memory_scope) + end + + # 1D convenience: single Tile index + @eval @inline function $fname(array::TileArray{T, 1}, indices::Tile{<:Integer}, val::T; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T} + $fname(array, (indices,), val; memory_order, memory_scope) + end + + @eval @inline function $fname(array::TileArray{T, 1}, indices::Tile{<:Integer}, val::Tile{T}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T} + $fname(array, (indices,), val; memory_order, memory_scope) + end + +end + +# --- CAS operations (separate due to different signature) --- + +# N-D with scalar expected/desired +@inline function atomic_cas(array::TileArray{T, N}, + indices::NTuple{N, Tile{<:Integer}}, + expected::T, desired::T; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, N} + ptr_tile, mask, S = _atomic_ptrs_mask(array, indices) + expected_tile = broadcast_to(Tile(expected), S) + desired_tile = broadcast_to(Tile(desired), S) + Intrinsics.atomic_cas_tile(ptr_tile, expected_tile, desired_tile, mask, + memory_order, memory_scope) +end + +# N-D with tile expected/desired +@inline function atomic_cas(array::TileArray{T, N}, + indices::NTuple{N, Tile{<:Integer}}, + expected::Tile{T}, desired::Tile{T}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, N} 
+ ptr_tile, mask, S = _atomic_ptrs_mask(array, indices) + expected_bc = broadcast_to(expected, S) + desired_bc = broadcast_to(desired, S) + Intrinsics.atomic_cas_tile(ptr_tile, expected_bc, desired_bc, mask, + memory_order, memory_scope) +end + +# 1D convenience: single Tile index +@inline function atomic_cas(array::TileArray{T, 1}, indices::Tile{<:Integer}, + expected::T, desired::T; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T} + atomic_cas(array, (indices,), expected, desired; memory_order, memory_scope) +end + +@inline function atomic_cas(array::TileArray{T, 1}, indices::Tile{<:Integer}, + expected::Tile{T}, desired::Tile{T}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T} + atomic_cas(array, (indices,), expected, desired; memory_order, memory_scope) +end + +# ============================================================================ +# Tile-space atomic operations +# These accept tile-space integer indices (like store) to atomically operate +# on contiguous tile-shaped blocks of an array. +# ============================================================================ + +# --- Pointer/mask helper for tile-space indexing --- + +@inline function _tile_space_ptrs_mask(array::TileArray{T, N}, + index::NTuple{N, Integer}, + ::Val{Shape}) where {T, N, Shape} + # Build per-dimension element index tiles (1-indexed) + # For dim d: arange [1..Shape[d]], reshaped for N-D broadcasting, plus base offset + idx_tiles = ntuple(Val(N)) do d + bcast_shape = ntuple(i -> i == d ? 
Shape[d] : 1, Val(N)) + base = Int32((index[d] - 1) * Shape[d]) + reshape(arange((Shape[d],), Int32), bcast_shape) .+ Tile(base) + end + + # 0-indexed linear offset: sum((idx[d] - 1) * stride[d]) + linear_idx = reduce(.+, ntuple(Val(N)) do d + (idx_tiles[d] .- Tile(Int32(1))) .* Tile(array.strides[d]) + end) + + ptr_tile = Intrinsics.offset(array.ptr, linear_idx) + + # Bounds mask: 1 <= idx[d] <= size(array, d) for all d + mask = reduce(.&, ntuple(Val(N)) do d + (idx_tiles[d] .>= Tile(Int32(1))) .& (idx_tiles[d] .<= Tile(size(array, d))) + end) + + (ptr_tile, mask) +end + +# --- Tile-space atomic_add --- + +# N-D tuple index + tile value (like store) +@inline function atomic_add(array::TileArray{T, N}, + index::NTuple{N, Integer}, tile::Tile{T}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, N} + reshaped = _reshape_to_rank(tile, Val(N)) + ptr_tile, mask = _tile_space_ptrs_mask(array, index, Val(size(reshaped))) + Intrinsics.atomic_add_tile(ptr_tile, reshaped, mask, memory_order, memory_scope) +end + +# 1D convenience (scalar index) +@inline function atomic_add(array::TileArray{T, 1}, + index::Integer, tile::Tile{T}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T} + atomic_add(array, (index,), tile; memory_order, memory_scope) +end diff --git a/test/codegen/operations.jl b/test/codegen/operations.jl index 8da55a9..b030aa6 100644 --- a/test/codegen/operations.jl +++ b/test/codegen/operations.jl @@ -1418,6 +1418,96 @@ end end end + + @testset "tile-indexed atomic_cas_tko" begin + spec = ct.ArraySpec{1}(16, true) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do arr + @check "iota" + indices = ct.arange((16,), Int) + @check "offset" + @check "atomic_cas_tko" + ct.atomic_cas(arr, indices, Int32(0), Int32(1)) + return + end + end + end + + @testset "tile-indexed 3D atomic_add" begin + spec3d = ct.ArraySpec{3}(16, true) + @test @filecheck begin 
+ @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,3,spec3d}}) do arr + @check "iota" + i = ct.arange((4,), Int) + j = ct.arange((4,), Int) + k = ct.arange((4,), Int) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_add(arr, (i, j, k), Int32(1)) + return + end + end + end + + @testset "tile-indexed atomic_rmw_tko" begin + spec = ct.ArraySpec{1}(16, true) + # xchg + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do arr + @check "iota" + indices = ct.arange((16,), Int) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_xchg(arr, indices, Int32(42)) + return + end + end + + # add (integer) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do arr + @check "iota" + indices = ct.arange((16,), Int) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_add(arr, indices, Int32(1)) + return + end + end + + # add (float) + spec_f32 = ct.ArraySpec{1}(16, true) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,1,spec_f32}}) do arr + @check "iota" + indices = ct.arange((16,), Int) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_add(arr, indices, 1.5f0) + return + end + end + end + + @testset "tile-space atomic_add" begin + spec = ct.ArraySpec{1}(16, true) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec}, Int}) do arr, bid + @check "iota" + tile = ct.full((16,), Int32(1), Int32) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_add(arr, bid, tile) + return + end + end + end end #========================================================================= diff --git a/test/execution/atomics.jl b/test/execution/atomics.jl index 81ffe19..eebbb1e 100644 --- a/test/execution/atomics.jl +++ b/test/execution/atomics.jl @@ -166,6 +166,289 @@ end @test result == n_blocks end +# ============================================================================ +# Tile-indexed atomic 
operations (scatter-gather style indexing) +# ============================================================================ + +@testset "atomic_add tile-indexed 1D" begin + function atomic_add_tile_kernel(arr::ct.TileArray{Int,1}, TILE::Int) + bid = ct.bid(1) + base = (bid - 1) * TILE + indices = base .+ ct.arange((TILE,), Int) + ct.atomic_add(arr, indices, 1; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + tile_size = 16 + n = 256 + n_blocks = div(n, tile_size) + arr = CUDA.zeros(Int, n) + + ct.launch(atomic_add_tile_kernel, n_blocks, arr, ct.Constant(tile_size)) + + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-indexed returns old values" begin + function atomic_add_return_kernel(arr::ct.TileArray{Int,1}, out::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + old_vals = ct.atomic_add(arr, indices, 1; + memory_order=ct.MemoryOrder.AcqRel) + ct.scatter(out, indices, old_vals) + return + end + + arr = CUDA.zeros(Int, 16) + out = CUDA.fill(Int(-1), 16) + + ct.launch(atomic_add_return_kernel, 1, arr, out) + + @test all(Array(out) .== 0) + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-indexed Float32" begin + function atomic_add_f32_tile_kernel(arr::ct.TileArray{Float32,1}, TILE::Int) + bid = ct.bid(1) + base = (bid - 1) * TILE + indices = base .+ ct.arange((TILE,), Int) + ct.atomic_add(arr, indices, 1.5f0; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + tile_size = 16 + n = 256 + n_blocks = div(n, tile_size) + arr = CUDA.zeros(Float32, n) + + ct.launch(atomic_add_f32_tile_kernel, n_blocks, arr, ct.Constant(tile_size)) + + @test all(isapprox.(Array(arr), 1.5f0)) +end + +@testset "atomic_add tile-indexed with tile values" begin + function atomic_add_tile_val_kernel(arr::ct.TileArray{Int,1}, + vals::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + val_tile = ct.gather(vals, indices) + ct.atomic_add(arr, indices, val_tile; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + arr = CUDA.zeros(Int, 16) + 
vals = CuArray(collect(Int, 1:16)) + + ct.launch(atomic_add_tile_val_kernel, 1, arr, vals) + + @test Array(arr) == collect(1:16) +end + +@testset "atomic_xchg tile-indexed" begin + function atomic_xchg_tile_kernel(arr::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + ct.atomic_xchg(arr, indices, 42; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + arr = CUDA.zeros(Int, 16) + + ct.launch(atomic_xchg_tile_kernel, 1, arr) + + @test all(Array(arr) .== 42) +end + +@testset "atomic_cas tile-indexed success" begin + function atomic_cas_tile_kernel(arr::ct.TileArray{Int,1}, out::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + old_vals = ct.atomic_cas(arr, indices, 0, 1; + memory_order=ct.MemoryOrder.AcqRel) + ct.scatter(out, indices, old_vals) + return + end + + arr = CUDA.zeros(Int, 16) + out = CUDA.fill(Int(-1), 16) + + ct.launch(atomic_cas_tile_kernel, 1, arr, out) + + @test all(Array(out) .== 0) + @test all(Array(arr) .== 1) +end + +@testset "atomic_cas tile-indexed failure" begin + function atomic_cas_fail_kernel(arr::ct.TileArray{Int,1}, out::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + old_vals = ct.atomic_cas(arr, indices, 0, 2; + memory_order=ct.MemoryOrder.AcqRel) + ct.scatter(out, indices, old_vals) + return + end + + arr = CUDA.fill(Int(1), 16) + out = CUDA.fill(Int(-1), 16) + + ct.launch(atomic_cas_fail_kernel, 1, arr, out) + + @test all(Array(out) .== 1) # old values returned + @test all(Array(arr) .== 1) # unchanged (CAS failed) +end + +@testset "atomic_add tile-indexed out-of-bounds" begin + function atomic_add_oob_kernel(arr::ct.TileArray{Int,1}) + # Index tile is larger than array — OOB elements should be masked + indices = ct.arange((16,), Int) + ct.atomic_add(arr, indices, 1; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + arr = CUDA.zeros(Int, 8) + + ct.launch(atomic_add_oob_kernel, 1, arr) + + # Only first 8 elements should be updated + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-indexed 
3D" begin + function atomic_add_3d_kernel(arr::ct.TileArray{Int,3}) + # 3D index tiles — each is length 4, will broadcast to (4,4,4) = 64 elements + i = ct.reshape(ct.arange((4,), Int), (4, 1, 1)) + j = ct.reshape(ct.arange((4,), Int), (1, 4, 1)) + k = ct.reshape(ct.arange((4,), Int), (1, 1, 4)) + ct.atomic_add(arr, (i, j, k), 1; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + arr = CUDA.zeros(Int, 4, 4, 4) + + ct.launch(atomic_add_3d_kernel, 1, arr) + + @test all(Array(arr) .== 1) +end + +# Tile-space atomic operations (block-level indexing, like store) + +@testset "atomic_add tile-space 1D" begin + function atomic_add_ts_kernel(arr::ct.TileArray{Int,1}, TILE::Int) + bid = ct.bid(1) + tile = ct.full((TILE,), 1, Int) + ct.atomic_add(arr, bid, tile) + return + end + + TILE = 16 + arr = CUDA.zeros(Int, 64) + + ct.launch(atomic_add_ts_kernel, 4, arr, ct.Constant(TILE)) + + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-space returns old values" begin + function atomic_add_ts_old_kernel(arr::ct.TileArray{Int,1}, + out::ct.TileArray{Int,1}) + bid = ct.bid(1) + tile = ct.full((16,), 1, Int) + old = ct.atomic_add(arr, bid, tile) + ct.store(out, bid, old) + return + end + + arr = CUDA.zeros(Int, 16) + out = CUDA.fill(Int(-1), 16) + + ct.launch(atomic_add_ts_old_kernel, 1, arr, out) + + @test all(Array(out) .== 0) # old values were 0 + @test all(Array(arr) .== 1) # now incremented +end + +@testset "atomic_add tile-space 1D tuple index" begin + # Test the N-D path with a 1-tuple index (not the scalar convenience) + function atomic_add_ts_tuple1d_kernel(arr::ct.TileArray{Int,1}) + bid = ct.bid(1) + tile = ct.full((16,), 1, Int) + ct.atomic_add(arr, (bid,), tile) + return + end + + arr = CUDA.zeros(Int, 32) + + ct.launch(atomic_add_ts_tuple1d_kernel, 2, arr) + + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-space 2D" begin + function atomic_add_ts_2d_kernel(arr::ct.TileArray{Int,2}) + bid = ct.bid(1) + tile = ct.full((4, 4), 1, Int) + 
ct.atomic_add(arr, (bid, Int32(1)), tile) + return + end + + arr = CUDA.zeros(Int, 4, 8) # 4 rows, 8 cols = 2 col-tiles of width 4 + + ct.launch(atomic_add_ts_2d_kernel, 1, arr) + + result = Array(arr) + @test all(result[:, 1:4] .== 1) # first col-tile updated + @test all(result[:, 5:8] .== 0) # second col-tile untouched +end + +@testset "atomic_add tile-space 2D both dims" begin + # 2 blocks: block 1 writes tile (1,1), block 2 writes tile (1,2) + function atomic_add_ts_2d_both_kernel(arr::ct.TileArray{Int,2}) + bid = ct.bid(1) + tile = ct.full((4, 4), 1, Int) + ct.atomic_add(arr, (Int32(1), bid), tile) + return + end + + arr = CUDA.zeros(Int, 4, 8) + + ct.launch(atomic_add_ts_2d_both_kernel, 2, arr) + + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-space 3D" begin + function atomic_add_ts_3d_kernel(arr::ct.TileArray{Int,3}) + bid = ct.bid(1) + tile = ct.full((2, 2, 2), 1, Int) + ct.atomic_add(arr, (bid, Int32(1), Int32(1)), tile) + return + end + + arr = CUDA.zeros(Int, 4, 2, 2) # 2 tiles along dim 1 + + ct.launch(atomic_add_ts_3d_kernel, 2, arr) + + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-space trailing singleton" begin + # 2D tile into a 3D array — tile should be auto-reshaped to (4, 4, 1) + function atomic_add_ts_trailing_kernel(arr::ct.TileArray{Int,3}) + bid = ct.bid(1) + tile = ct.full((4, 4), 1, Int) # 2D tile + ct.atomic_add(arr, (bid, Int32(1), Int32(1)), tile) # 3D index + return + end + + arr = CUDA.zeros(Int, 8, 4, 1) # 2 tiles along dim 1 + + ct.launch(atomic_add_ts_trailing_kernel, 2, arr) + + @test all(Array(arr) .== 1) +end + @testset "1D gather - simple" begin # Simple 1D gather: copy first 16 elements using gather function gather_simple_kernel(src::ct.TileArray{Float32,1}, dst::ct.TileArray{Float32,1})