From 6811a6035bf9cc4a04d1eaab61e91d0e3f8a1506 Mon Sep 17 00:00:00 2001 From: AntonOresten Date: Mon, 23 Feb 2026 11:35:15 +0100 Subject: [PATCH 1/5] Add tile-indexed methods for existing atomic operations --- src/compiler/intrinsics/atomics.jl | 116 ++++++++++++++++++++++ src/language/atomics.jl | 147 ++++++++++++++++++++++++++++ test/codegen/operations.jl | 58 +++++++++++ test/execution/atomics.jl | 148 +++++++++++++++++++++++++++++ 4 files changed, 469 insertions(+) diff --git a/src/compiler/intrinsics/atomics.jl b/src/compiler/intrinsics/atomics.jl index 9c480bf..2a84539 100644 --- a/src/compiler/intrinsics/atomics.jl +++ b/src/compiler/intrinsics/atomics.jl @@ -177,3 +177,119 @@ efunc(::typeof(Intrinsics.atomic_add), effects::CC.Effects) = function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add), args) emit_atomic_rmw!(ctx, args, AtomicADD) end + +# ============================================================================ +# Tile-indexed atomic operations +# These take pre-computed pointer tiles, value tiles, and masks. +# Used by the public API for tile-indexed atomic operations. 
+# ============================================================================ + +# Shared codegen helper for tile-indexed atomic RMW operations +function emit_atomic_rmw_tile!(ctx::CGCtx, args::AbstractVector, mode::AtomicRMWMode) + cb = ctx.cb + tt = ctx.tt + + # args: (ptr_tile, val, mask, memory_order, memory_scope) + ptr_tv = emit_value!(ctx, args[1]) + ptr_tv === nothing && throw(IRError("tile-indexed atomic RMW requires ptr_tile")) + val_tv = emit_value!(ctx, args[2]) + val_tv === nothing && throw(IRError("tile-indexed atomic RMW requires value")) + mask_tv = emit_value!(ctx, args[3]) + mask_tv === nothing && throw(IRError("tile-indexed atomic RMW requires mask")) + + memory_order = @something get_constant(ctx, args[4]) throw(IRError("tile-indexed atomic RMW requires constant memory_order")) + memory_scope = @something get_constant(ctx, args[5]) throw(IRError("tile-indexed atomic RMW requires constant memory_scope")) + + shape = val_tv.shape + elem_type = eltype(val_tv.jltype) + + dtype = julia_to_tile_dtype!(tt, elem_type) + result_tile_type = tile_type!(tt, dtype, collect(shape)) + token_type = Token(tt) + + # Auto-promote integer ADD to float ADD for floating-point types + actual_mode = mode + if mode == AtomicADD && elem_type <: AbstractFloat + actual_mode = AtomicADDF + end + + mem_ordering = memory_order_to_semantics(memory_order) + mem_scope = memory_scope_to_scope(memory_scope) + + old_val, new_token = encode_AtomicRMWPtrOp!(cb, result_tile_type, token_type, + ptr_tv.v, val_tv.v, actual_mode; + mask=mask_tv.v, + token=ctx.token, + memory_ordering=mem_ordering, + memory_scope=mem_scope) + ctx.token = new_token + + CGVal(old_val, result_tile_type, Tile{elem_type, Tuple{shape...}}, collect(shape)) +end + +# Tile-indexed atomic exchange +@intrinsic atomic_xchg_tile(ptr_tile, val, mask, memory_order, memory_scope) +function tfunc(𝕃, ::typeof(Intrinsics.atomic_xchg_tile), @nospecialize(ptrs), @nospecialize(val), @nospecialize args...) 
+ CC.widenconst(val) +end +efunc(::typeof(Intrinsics.atomic_xchg_tile), effects::CC.Effects) = + CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_xchg_tile), args) + emit_atomic_rmw_tile!(ctx, args, AtomicXCHG) +end + +# Tile-indexed atomic addition +@intrinsic atomic_add_tile(ptr_tile, val, mask, memory_order, memory_scope) +function tfunc(𝕃, ::typeof(Intrinsics.atomic_add_tile), @nospecialize(ptrs), @nospecialize(val), @nospecialize args...) + CC.widenconst(val) +end +efunc(::typeof(Intrinsics.atomic_add_tile), effects::CC.Effects) = + CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add_tile), args) + emit_atomic_rmw_tile!(ctx, args, AtomicADD) +end + +# Tile-indexed atomic compare-and-swap +@intrinsic atomic_cas_tile(ptr_tile, expected, desired, mask, memory_order, memory_scope) +function tfunc(𝕃, ::typeof(Intrinsics.atomic_cas_tile), @nospecialize(ptrs), @nospecialize(expected), @nospecialize args...) 
+ CC.widenconst(expected) +end +efunc(::typeof(Intrinsics.atomic_cas_tile), effects::CC.Effects) = + CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_cas_tile), args) + cb = ctx.cb + tt = ctx.tt + + # args: (ptr_tile, expected, desired, mask, memory_order, memory_scope) + ptr_tv = emit_value!(ctx, args[1]) + ptr_tv === nothing && throw(IRError("tile-indexed atomic CAS requires ptr_tile")) + expected_tv = emit_value!(ctx, args[2]) + expected_tv === nothing && throw(IRError("tile-indexed atomic CAS requires expected value")) + desired_tv = emit_value!(ctx, args[3]) + desired_tv === nothing && throw(IRError("tile-indexed atomic CAS requires desired value")) + mask_tv = emit_value!(ctx, args[4]) + mask_tv === nothing && throw(IRError("tile-indexed atomic CAS requires mask")) + + memory_order = @something get_constant(ctx, args[5]) throw(IRError("tile-indexed atomic CAS requires constant memory_order")) + memory_scope = @something get_constant(ctx, args[6]) throw(IRError("tile-indexed atomic CAS requires constant memory_scope")) + + shape = expected_tv.shape + elem_type = eltype(expected_tv.jltype) + + dtype = julia_to_tile_dtype!(tt, elem_type) + result_tile_type = tile_type!(tt, dtype, collect(shape)) + token_type = Token(tt) + + mem_ordering = memory_order_to_semantics(memory_order) + mem_scope = memory_scope_to_scope(memory_scope) + + old_val, new_token = encode_AtomicCASPtrOp!(cb, result_tile_type, token_type, + ptr_tv.v, expected_tv.v, desired_tv.v; + mask=mask_tv.v, + token=ctx.token, + memory_ordering=mem_ordering, + memory_scope=mem_scope) + ctx.token = new_token + + CGVal(old_val, result_tile_type, Tile{elem_type, Tuple{shape...}}, collect(shape)) +end diff --git a/src/language/atomics.jl b/src/language/atomics.jl index 5405449..c3ee839 100644 --- a/src/language/atomics.jl +++ b/src/language/atomics.jl @@ -80,3 +80,150 @@ old_val = ct.atomic_add(counters, idx, Int32(1)) 
memory_scope::Int=MemScope.Device) where {T} Intrinsics.atomic_add(array, index - One(), val, memory_order, memory_scope) end + +# ============================================================================ +# Tile-indexed atomic operations (scatter-gather style indexing) +# These accept Tile indices to perform atomic operations on multiple elements. +# ============================================================================ + +# --- Pointer/mask helpers (same pattern as gather/scatter in operations.jl) --- + +@inline function _atomic_ptrs_mask(array::TileArray{T, 1}, indices::Tile{I}) where {T, I <: Integer} + indices_0 = indices .- one(I) + indices_i32 = convert(Tile{Int32}, indices_0) + ptr_tile = Intrinsics.offset(array.ptr, indices_i32) + zero_0d = Tile(Int32(0)) + size_0d = Tile(size(array, 1)) + mask = (indices_i32 .>= zero_0d) .& (indices_i32 .< size_0d) + (ptr_tile, mask, size(indices)) +end + +@inline function _atomic_ptrs_mask(array::TileArray{T, 2}, + indices::Tuple{Tile{I0}, Tile{I1}}) where {T, I0 <: Integer, I1 <: Integer} + idx0_0 = indices[1] .- one(I0) + idx1_0 = indices[2] .- one(I1) + + S = broadcast_shape(size(indices[1]), size(indices[2])) + idx0_bc = broadcast_to(idx0_0, S) + idx1_bc = broadcast_to(idx1_0, S) + + idx0_i32 = convert(Tile{Int32}, idx0_bc) + idx1_i32 = convert(Tile{Int32}, idx1_bc) + + stride0_0d = Tile(array.strides[1]) + stride1_0d = Tile(array.strides[2]) + stride0 = broadcast_to(stride0_0d, S) + stride1 = broadcast_to(stride1_0d, S) + + linear_idx = idx0_i32 .* stride0 + idx1_i32 .* stride1 + ptr_tile = Intrinsics.offset(array.ptr, linear_idx) + + zero_0d = Tile(Int32(0)) + zero_bc = broadcast_to(zero_0d, S) + size0_bc = broadcast_to(Tile(size(array, 1)), S) + size1_bc = broadcast_to(Tile(size(array, 2)), S) + + mask0 = (idx0_i32 .>= zero_bc) .& (idx0_i32 .< size0_bc) + mask1 = (idx1_i32 .>= zero_bc) .& (idx1_i32 .< size1_bc) + mask = mask0 .& mask1 + + (ptr_tile, mask, S) +end + +# --- RMW operations (atomic_add, 
atomic_xchg) --- + +const _ATOMIC_RMW_OPS = ( + (:add, :atomic_add_tile), + (:xchg, :atomic_xchg_tile), +) + +for (op, intrinsic) in _ATOMIC_RMW_OPS + fname = Symbol(:atomic_, op) + + # 1D with scalar value + @eval @inline function $fname(array::TileArray{T, 1}, indices::Tile{I}, val::T; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I <: Integer} + ptr_tile, mask, S = _atomic_ptrs_mask(array, indices) + val_tile = broadcast_to(Tile(val), S) + Intrinsics.$intrinsic(ptr_tile, val_tile, mask, memory_order, memory_scope) + end + + # 1D with tile value + @eval @inline function $fname(array::TileArray{T, 1}, indices::Tile{I}, val::Tile{T}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I <: Integer} + ptr_tile, mask, _ = _atomic_ptrs_mask(array, indices) + Intrinsics.$intrinsic(ptr_tile, val, mask, memory_order, memory_scope) + end + + # 2D with scalar value + @eval @inline function $fname(array::TileArray{T, 2}, + indices::Tuple{Tile{I0}, Tile{I1}}, val::T; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer} + ptr_tile, mask, S = _atomic_ptrs_mask(array, indices) + val_tile = broadcast_to(Tile(val), S) + Intrinsics.$intrinsic(ptr_tile, val_tile, mask, memory_order, memory_scope) + end + + # 2D with tile value + @eval @inline function $fname(array::TileArray{T, 2}, + indices::Tuple{Tile{I0}, Tile{I1}}, val::Tile{T}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer} + ptr_tile, mask, S = _atomic_ptrs_mask(array, indices) + val_bc = broadcast_to(val, S) + Intrinsics.$intrinsic(ptr_tile, val_bc, mask, memory_order, memory_scope) + end +end + +# --- CAS operations (separate due to different signature) --- + +# 1D with scalar expected/desired +@inline function atomic_cas(array::TileArray{T, 1}, indices::Tile{I}, + expected::T, desired::T; + 
memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I <: Integer} + ptr_tile, mask, S = _atomic_ptrs_mask(array, indices) + expected_tile = broadcast_to(Tile(expected), S) + desired_tile = broadcast_to(Tile(desired), S) + Intrinsics.atomic_cas_tile(ptr_tile, expected_tile, desired_tile, mask, + memory_order, memory_scope) +end + +# 1D with tile expected/desired +@inline function atomic_cas(array::TileArray{T, 1}, indices::Tile{I}, + expected::Tile{T}, desired::Tile{T}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I <: Integer} + ptr_tile, mask, _ = _atomic_ptrs_mask(array, indices) + Intrinsics.atomic_cas_tile(ptr_tile, expected, desired, mask, + memory_order, memory_scope) +end + +# 2D with scalar expected/desired +@inline function atomic_cas(array::TileArray{T, 2}, + indices::Tuple{Tile{I0}, Tile{I1}}, + expected::T, desired::T; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer} + ptr_tile, mask, S = _atomic_ptrs_mask(array, indices) + expected_tile = broadcast_to(Tile(expected), S) + desired_tile = broadcast_to(Tile(desired), S) + Intrinsics.atomic_cas_tile(ptr_tile, expected_tile, desired_tile, mask, + memory_order, memory_scope) +end + +# 2D with tile expected/desired +@inline function atomic_cas(array::TileArray{T, 2}, + indices::Tuple{Tile{I0}, Tile{I1}}, + expected::Tile{T}, desired::Tile{T}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer} + ptr_tile, mask, S = _atomic_ptrs_mask(array, indices) + expected_bc = broadcast_to(expected, S) + desired_bc = broadcast_to(desired, S) + Intrinsics.atomic_cas_tile(ptr_tile, expected_bc, desired_bc, mask, + memory_order, memory_scope) +end diff --git a/test/codegen/operations.jl b/test/codegen/operations.jl index 8da55a9..57a2f31 100644 --- a/test/codegen/operations.jl +++ b/test/codegen/operations.jl @@ -1418,6 
+1418,64 @@ end end end + + @testset "tile-indexed atomic_cas_tko" begin + spec = ct.ArraySpec{1}(16, true) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do arr + @check "iota" + indices = ct.arange((16,), Int) + @check "offset" + @check "atomic_cas_tko" + ct.atomic_cas(arr, indices, Int32(0), Int32(1)) + return + end + end + end + + @testset "tile-indexed atomic_rmw_tko" begin + spec = ct.ArraySpec{1}(16, true) + # xchg + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do arr + @check "iota" + indices = ct.arange((16,), Int) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_xchg(arr, indices, Int32(42)) + return + end + end + + # add (integer) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do arr + @check "iota" + indices = ct.arange((16,), Int) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_add(arr, indices, Int32(1)) + return + end + end + + # add (float) + spec_f32 = ct.ArraySpec{1}(16, true) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,1,spec_f32}}) do arr + @check "iota" + indices = ct.arange((16,), Int) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_add(arr, indices, 1.5f0) + return + end + end + end end #========================================================================= diff --git a/test/execution/atomics.jl b/test/execution/atomics.jl index 81ffe19..60c93d4 100644 --- a/test/execution/atomics.jl +++ b/test/execution/atomics.jl @@ -166,6 +166,154 @@ end @test result == n_blocks end +# ============================================================================ +# Tile-indexed atomic operations (scatter-gather style indexing) +# ============================================================================ + +@testset "atomic_add tile-indexed 1D" begin + function atomic_add_tile_kernel(arr::ct.TileArray{Int,1}, TILE::Int) + bid = ct.bid(1) + 
base = (bid - 1) * TILE + indices = base .+ ct.arange((TILE,), Int) + ct.atomic_add(arr, indices, 1; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + tile_size = 16 + n = 256 + n_blocks = div(n, tile_size) + arr = CUDA.zeros(Int, n) + + ct.launch(atomic_add_tile_kernel, n_blocks, arr, ct.Constant(tile_size)) + + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-indexed returns old values" begin + function atomic_add_return_kernel(arr::ct.TileArray{Int,1}, out::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + old_vals = ct.atomic_add(arr, indices, 1; + memory_order=ct.MemoryOrder.AcqRel) + ct.scatter(out, indices, old_vals) + return + end + + arr = CUDA.zeros(Int, 16) + out = CUDA.fill(Int(-1), 16) + + ct.launch(atomic_add_return_kernel, 1, arr, out) + + @test all(Array(out) .== 0) + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-indexed Float32" begin + function atomic_add_f32_tile_kernel(arr::ct.TileArray{Float32,1}, TILE::Int) + bid = ct.bid(1) + base = (bid - 1) * TILE + indices = base .+ ct.arange((TILE,), Int) + ct.atomic_add(arr, indices, 1.5f0; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + tile_size = 16 + n = 256 + n_blocks = div(n, tile_size) + arr = CUDA.zeros(Float32, n) + + ct.launch(atomic_add_f32_tile_kernel, n_blocks, arr, ct.Constant(tile_size)) + + @test all(isapprox.(Array(arr), 1.5f0)) +end + +@testset "atomic_add tile-indexed with tile values" begin + function atomic_add_tile_val_kernel(arr::ct.TileArray{Int,1}, + vals::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + val_tile = ct.gather(vals, indices) + ct.atomic_add(arr, indices, val_tile; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + arr = CUDA.zeros(Int, 16) + vals = CuArray(collect(Int, 1:16)) + + ct.launch(atomic_add_tile_val_kernel, 1, arr, vals) + + @test Array(arr) == collect(1:16) +end + +@testset "atomic_xchg tile-indexed" begin + function atomic_xchg_tile_kernel(arr::ct.TileArray{Int,1}) + indices = 
ct.arange((16,), Int) + ct.atomic_xchg(arr, indices, 42; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + arr = CUDA.zeros(Int, 16) + + ct.launch(atomic_xchg_tile_kernel, 1, arr) + + @test all(Array(arr) .== 42) +end + +@testset "atomic_cas tile-indexed success" begin + function atomic_cas_tile_kernel(arr::ct.TileArray{Int,1}, out::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + old_vals = ct.atomic_cas(arr, indices, 0, 1; + memory_order=ct.MemoryOrder.AcqRel) + ct.scatter(out, indices, old_vals) + return + end + + arr = CUDA.zeros(Int, 16) + out = CUDA.fill(Int(-1), 16) + + ct.launch(atomic_cas_tile_kernel, 1, arr, out) + + @test all(Array(out) .== 0) + @test all(Array(arr) .== 1) +end + +@testset "atomic_cas tile-indexed failure" begin + function atomic_cas_fail_kernel(arr::ct.TileArray{Int,1}, out::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + old_vals = ct.atomic_cas(arr, indices, 0, 2; + memory_order=ct.MemoryOrder.AcqRel) + ct.scatter(out, indices, old_vals) + return + end + + arr = CUDA.fill(Int(1), 16) + out = CUDA.fill(Int(-1), 16) + + ct.launch(atomic_cas_fail_kernel, 1, arr, out) + + @test all(Array(out) .== 1) # old values returned + @test all(Array(arr) .== 1) # unchanged (CAS failed) +end + +@testset "atomic_add tile-indexed out-of-bounds" begin + function atomic_add_oob_kernel(arr::ct.TileArray{Int,1}) + # Index tile is larger than array — OOB elements should be masked + indices = ct.arange((16,), Int) + ct.atomic_add(arr, indices, 1; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + arr = CUDA.zeros(Int, 8) + + ct.launch(atomic_add_oob_kernel, 1, arr) + + # Only first 8 elements should be updated + @test all(Array(arr) .== 1) +end + @testset "1D gather - simple" begin # Simple 1D gather: copy first 16 elements using gather function gather_simple_kernel(src::ct.TileArray{Float32,1}, dst::ct.TileArray{Float32,1}) From 732bf63f6b1b1691286177c3cf7aef125211ef9f Mon Sep 17 00:00:00 2001 From: AntonOresten Date: Mon, 23 
Feb 2026 11:53:02 +0100 Subject: [PATCH 2/5] generalize to N dimensions --- src/language/atomics.jl | 147 ++++++++++++++++--------------------- test/codegen/operations.jl | 17 +++++ test/execution/atomics.jl | 18 +++++ 3 files changed, 100 insertions(+), 82 deletions(-) diff --git a/src/language/atomics.jl b/src/language/atomics.jl index c3ee839..893c0be 100644 --- a/src/language/atomics.jl +++ b/src/language/atomics.jl @@ -82,54 +82,48 @@ old_val = ct.atomic_add(counters, idx, Int32(1)) end # ============================================================================ -# Tile-indexed atomic operations (scatter-gather style indexing) +# Tile-indexed atomic operations # These accept Tile indices to perform atomic operations on multiple elements. # ============================================================================ -# --- Pointer/mask helpers (same pattern as gather/scatter in operations.jl) --- +# --- Pointer/mask helper (N-dimensional) --- -@inline function _atomic_ptrs_mask(array::TileArray{T, 1}, indices::Tile{I}) where {T, I <: Integer} - indices_0 = indices .- one(I) - indices_i32 = convert(Tile{Int32}, indices_0) - ptr_tile = Intrinsics.offset(array.ptr, indices_i32) - zero_0d = Tile(Int32(0)) - size_0d = Tile(size(array, 1)) - mask = (indices_i32 .>= zero_0d) .& (indices_i32 .< size_0d) - (ptr_tile, mask, size(indices)) -end - -@inline function _atomic_ptrs_mask(array::TileArray{T, 2}, - indices::Tuple{Tile{I0}, Tile{I1}}) where {T, I0 <: Integer, I1 <: Integer} - idx0_0 = indices[1] .- one(I0) - idx1_0 = indices[2] .- one(I1) +@inline function _atomic_ptrs_mask(array::TileArray{T, N}, + indices::NTuple{N, Tile{<:Integer}}) where {T, N} + # Convert each index to 0-indexed + indices_0 = ntuple(Val(N)) do d + indices[d] .- one(eltype(indices[d])) + end - S = broadcast_shape(size(indices[1]), size(indices[2])) - idx0_bc = broadcast_to(idx0_0, S) - idx1_bc = broadcast_to(idx1_0, S) + # Broadcast all index tiles to a common shape + S = 
reduce(broadcast_shape, ntuple(d -> size(indices[d]), Val(N))) - idx0_i32 = convert(Tile{Int32}, idx0_bc) - idx1_i32 = convert(Tile{Int32}, idx1_bc) + # Broadcast and convert to Int32 + indices_i32 = ntuple(Val(N)) do d + convert(Tile{Int32}, broadcast_to(indices_0[d], S)) + end - stride0_0d = Tile(array.strides[1]) - stride1_0d = Tile(array.strides[2]) - stride0 = broadcast_to(stride0_0d, S) - stride1 = broadcast_to(stride1_0d, S) + # Linear index: sum(idx[d] * stride[d]) + linear_idx = reduce(.+, ntuple(Val(N)) do d + indices_i32[d] .* broadcast_to(Tile(array.strides[d]), S) + end) - linear_idx = idx0_i32 .* stride0 + idx1_i32 .* stride1 ptr_tile = Intrinsics.offset(array.ptr, linear_idx) - zero_0d = Tile(Int32(0)) - zero_bc = broadcast_to(zero_0d, S) - size0_bc = broadcast_to(Tile(size(array, 1)), S) - size1_bc = broadcast_to(Tile(size(array, 2)), S) - - mask0 = (idx0_i32 .>= zero_bc) .& (idx0_i32 .< size0_bc) - mask1 = (idx1_i32 .>= zero_bc) .& (idx1_i32 .< size1_bc) - mask = mask0 .& mask1 + # Bounds mask: 0 <= idx[d] < size[d] for all d + zero_bc = broadcast_to(Tile(Int32(0)), S) + mask = reduce(.&, ntuple(Val(N)) do d + (indices_i32[d] .>= zero_bc) .& (indices_i32[d] .< broadcast_to(Tile(size(array, d)), S)) + end) (ptr_tile, mask, S) end +# 1D convenience: single Tile -> 1-tuple +@inline function _atomic_ptrs_mask(array::TileArray{T, 1}, indices::Tile{<:Integer}) where {T} + _atomic_ptrs_mask(array, (indices,)) +end + # --- RMW operations (atomic_add, atomic_xchg) --- const _ATOMIC_RMW_OPS = ( @@ -140,51 +134,48 @@ const _ATOMIC_RMW_OPS = ( for (op, intrinsic) in _ATOMIC_RMW_OPS fname = Symbol(:atomic_, op) - # 1D with scalar value - @eval @inline function $fname(array::TileArray{T, 1}, indices::Tile{I}, val::T; + # N-D with scalar value + @eval @inline function $fname(array::TileArray{T, N}, + indices::NTuple{N, Tile{<:Integer}}, val::T; memory_order::Int=MemoryOrder.AcqRel, - memory_scope::Int=MemScope.Device) where {T, I <: Integer} + 
memory_scope::Int=MemScope.Device) where {T, N} ptr_tile, mask, S = _atomic_ptrs_mask(array, indices) val_tile = broadcast_to(Tile(val), S) Intrinsics.$intrinsic(ptr_tile, val_tile, mask, memory_order, memory_scope) end - # 1D with tile value - @eval @inline function $fname(array::TileArray{T, 1}, indices::Tile{I}, val::Tile{T}; + # N-D with tile value + @eval @inline function $fname(array::TileArray{T, N}, + indices::NTuple{N, Tile{<:Integer}}, val::Tile{T}; memory_order::Int=MemoryOrder.AcqRel, - memory_scope::Int=MemScope.Device) where {T, I <: Integer} - ptr_tile, mask, _ = _atomic_ptrs_mask(array, indices) - Intrinsics.$intrinsic(ptr_tile, val, mask, memory_order, memory_scope) + memory_scope::Int=MemScope.Device) where {T, N} + ptr_tile, mask, S = _atomic_ptrs_mask(array, indices) + val_bc = broadcast_to(val, S) + Intrinsics.$intrinsic(ptr_tile, val_bc, mask, memory_order, memory_scope) end - # 2D with scalar value - @eval @inline function $fname(array::TileArray{T, 2}, - indices::Tuple{Tile{I0}, Tile{I1}}, val::T; + # 1D convenience: single Tile index + @eval @inline function $fname(array::TileArray{T, 1}, indices::Tile{<:Integer}, val::T; memory_order::Int=MemoryOrder.AcqRel, - memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer} - ptr_tile, mask, S = _atomic_ptrs_mask(array, indices) - val_tile = broadcast_to(Tile(val), S) - Intrinsics.$intrinsic(ptr_tile, val_tile, mask, memory_order, memory_scope) + memory_scope::Int=MemScope.Device) where {T} + $fname(array, (indices,), val; memory_order, memory_scope) end - # 2D with tile value - @eval @inline function $fname(array::TileArray{T, 2}, - indices::Tuple{Tile{I0}, Tile{I1}}, val::Tile{T}; + @eval @inline function $fname(array::TileArray{T, 1}, indices::Tile{<:Integer}, val::Tile{T}; memory_order::Int=MemoryOrder.AcqRel, - memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer} - ptr_tile, mask, S = _atomic_ptrs_mask(array, indices) - val_bc = broadcast_to(val, S) - 
Intrinsics.$intrinsic(ptr_tile, val_bc, mask, memory_order, memory_scope) + memory_scope::Int=MemScope.Device) where {T} + $fname(array, (indices,), val; memory_order, memory_scope) end end # --- CAS operations (separate due to different signature) --- -# 1D with scalar expected/desired -@inline function atomic_cas(array::TileArray{T, 1}, indices::Tile{I}, +# N-D with scalar expected/desired +@inline function atomic_cas(array::TileArray{T, N}, + indices::NTuple{N, Tile{<:Integer}}, expected::T, desired::T; memory_order::Int=MemoryOrder.AcqRel, - memory_scope::Int=MemScope.Device) where {T, I <: Integer} + memory_scope::Int=MemScope.Device) where {T, N} ptr_tile, mask, S = _atomic_ptrs_mask(array, indices) expected_tile = broadcast_to(Tile(expected), S) desired_tile = broadcast_to(Tile(desired), S) @@ -192,38 +183,30 @@ end memory_order, memory_scope) end -# 1D with tile expected/desired -@inline function atomic_cas(array::TileArray{T, 1}, indices::Tile{I}, +# N-D with tile expected/desired +@inline function atomic_cas(array::TileArray{T, N}, + indices::NTuple{N, Tile{<:Integer}}, expected::Tile{T}, desired::Tile{T}; memory_order::Int=MemoryOrder.AcqRel, - memory_scope::Int=MemScope.Device) where {T, I <: Integer} - ptr_tile, mask, _ = _atomic_ptrs_mask(array, indices) - Intrinsics.atomic_cas_tile(ptr_tile, expected, desired, mask, + memory_scope::Int=MemScope.Device) where {T, N} + ptr_tile, mask, S = _atomic_ptrs_mask(array, indices) + expected_bc = broadcast_to(expected, S) + desired_bc = broadcast_to(desired, S) + Intrinsics.atomic_cas_tile(ptr_tile, expected_bc, desired_bc, mask, memory_order, memory_scope) end -# 2D with scalar expected/desired -@inline function atomic_cas(array::TileArray{T, 2}, - indices::Tuple{Tile{I0}, Tile{I1}}, +# 1D convenience: single Tile index +@inline function atomic_cas(array::TileArray{T, 1}, indices::Tile{<:Integer}, expected::T, desired::T; memory_order::Int=MemoryOrder.AcqRel, - memory_scope::Int=MemScope.Device) where {T, I0 
<: Integer, I1 <: Integer} - ptr_tile, mask, S = _atomic_ptrs_mask(array, indices) - expected_tile = broadcast_to(Tile(expected), S) - desired_tile = broadcast_to(Tile(desired), S) - Intrinsics.atomic_cas_tile(ptr_tile, expected_tile, desired_tile, mask, - memory_order, memory_scope) + memory_scope::Int=MemScope.Device) where {T} + atomic_cas(array, (indices,), expected, desired; memory_order, memory_scope) end -# 2D with tile expected/desired -@inline function atomic_cas(array::TileArray{T, 2}, - indices::Tuple{Tile{I0}, Tile{I1}}, +@inline function atomic_cas(array::TileArray{T, 1}, indices::Tile{<:Integer}, expected::Tile{T}, desired::Tile{T}; memory_order::Int=MemoryOrder.AcqRel, - memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer} - ptr_tile, mask, S = _atomic_ptrs_mask(array, indices) - expected_bc = broadcast_to(expected, S) - desired_bc = broadcast_to(desired, S) - Intrinsics.atomic_cas_tile(ptr_tile, expected_bc, desired_bc, mask, - memory_order, memory_scope) + memory_scope::Int=MemScope.Device) where {T} + atomic_cas(array, (indices,), expected, desired; memory_order, memory_scope) end diff --git a/test/codegen/operations.jl b/test/codegen/operations.jl index 57a2f31..89782c1 100644 --- a/test/codegen/operations.jl +++ b/test/codegen/operations.jl @@ -1434,6 +1434,23 @@ end end + @testset "tile-indexed 3D atomic_add" begin + spec3d = ct.ArraySpec{3}(16, true) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,3,spec3d}}) do arr + @check "iota" + i = ct.arange((4,), Int) + j = ct.arange((4,), Int) + k = ct.arange((4,), Int) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_add(arr, (i, j, k), Int32(1)) + return + end + end + end + @testset "tile-indexed atomic_rmw_tko" begin spec = ct.ArraySpec{1}(16, true) # xchg diff --git a/test/execution/atomics.jl b/test/execution/atomics.jl index 60c93d4..e757ed8 100644 --- a/test/execution/atomics.jl +++ b/test/execution/atomics.jl @@ -314,6 +314,24 
@testset "atomic_add tile-indexed 3D" begin
    # Kernel: three length-4 index tiles, each laid out along its own axis so
    # that together they broadcast to a (4, 4, 4) index space.
    function atomic_add_3d_kernel(arr::ct.TileArray{Int,3})
        along_1 = ct.reshape(ct.arange((4,), Int), (4, 1, 1))
        along_2 = ct.reshape(ct.arange((4,), Int), (1, 4, 1))
        along_3 = ct.reshape(ct.arange((4,), Int), (1, 1, 4))
        ct.atomic_add(arr, (along_1, along_2, along_3), 1;
                      memory_order=ct.MemoryOrder.AcqRel)
        return
    end

    counters = CUDA.zeros(Int, 4, 4, 4)
    ct.launch(atomic_add_3d_kernel, 1, counters)

    # Every one of the 64 cells must have been incremented exactly once.
    @test all(==(1), Array(counters))
end
# Reduce ascending-sorted timings (seconds) to summary statistics in microseconds.
# Roughly 10% of the samples (at least one) are dropped from each tail before
# the median and mean are taken, but the trim is clamped so that at least one
# sample always survives.
function _summarize_times(sorted_times::AbstractVector{<:Real})
    n_total = length(sorted_times)
    trim = min(max(1, n_total ÷ 10), (n_total - 1) ÷ 2)
    trimmed = sorted_times[trim+1:end-trim]
    n = length(trimmed)

    return (
        # (n + 1) ÷ 2 is the middle element for odd n and the lower-middle one
        # for even n (same pick as before for the default iters=100).
        median = trimmed[(n + 1) ÷ 2] * 1e6,
        mean = sum(trimmed) / n * 1e6,
        min = sorted_times[1] * 1e6,
        max = sorted_times[end] * 1e6,
    )
end

# Time `ct.launch(f, grid, args...; kwargs...)`.
#
# Keywords:
#   warmup - number of untimed launches performed first
#   iters  - number of timed launches (must be >= 1)
#   reset  - optional zero-argument callback run before every launch, e.g. to
#            clear an accumulation buffer; it is not included in the timing
#
# Returns a NamedTuple `(median, mean, min, max)` in microseconds. `median`
# and `mean` are computed on the trimmed sample, `min`/`max` on all samples.
# Throws `ArgumentError` when `iters < 1`.
function bench(f, grid, args...; warmup=5, iters=100, reset=nothing, kwargs...)
    iters >= 1 || throw(ArgumentError("iters must be at least 1, got $iters"))

    for _ in 1:warmup
        reset !== nothing && reset()
        ct.launch(f, grid, args...; kwargs...)
    end
    CUDA.synchronize()

    times = Float64[]
    sizehint!(times, iters)
    for _ in 1:iters
        reset !== nothing && reset()
        # Make sure the reset work has finished before the timed region starts.
        CUDA.synchronize()
        t = CUDA.@elapsed begin
            ct.launch(f, grid, args...; kwargs...)
        end
        push!(times, t)
    end

    sort!(times)
    return _summarize_times(times)
end
# Compare tile-space `atomic_add` on a 2D (NxN) array against the same data
# stored as a 3D (NxNx1) array with a trailing singleton dimension.
#
# `configs` is a collection of `(tile_r, grid_r)` pairs: the tile edge length
# and the number of tiles per array edge. The default reproduces the original
# fixed sweep. For each pair both kernels are first checked for correctness
# (every element must equal 1 after one launch) and then timed with `bench`.
function bench_trailing_singleton(;
        configs=[(8, 8), (8, 16), (16, 16), (16, 32), (32, 32),
                 (32, 64), (32, 128), (64, 64), (64, 128)])
    println("=" ^ 60)
    println("Trailing singleton: 2D (NxN) vs 3D (NxNx1) atomic_add")
    println(" Same data, extra singleton dimension in 3D")
    println("=" ^ 60)
    println()

    for (tile_r, grid_r) in configs
        n_rows = tile_r * grid_r
        n_cols = n_rows
        n = n_rows * n_cols
        n_tiles = n ÷ (tile_r * tile_r)
        n_col_tiles = n_cols ÷ tile_r

        dst_2d = CUDA.zeros(Float32, n_rows, n_cols)
        dst_3d = CUDA.zeros(Float32, n_rows, n_cols, 1)

        # Correctness gate. Explicit checks instead of `@assert`, which may be
        # compiled out, so a broken kernel can never be benchmarked silently.
        ct.launch(atomic_add_2d_ref_kernel, n_tiles, dst_2d,
                  ct.Constant(tile_r), ct.Constant(n_col_tiles))
        ct.launch(atomic_add_3d_singleton_kernel, n_tiles, dst_3d,
                  ct.Constant(tile_r), ct.Constant(n_col_tiles))
        CUDA.synchronize()
        all(Array(dst_2d) .== 1.0f0) || error("2D failed")
        all(Array(dst_3d) .== 1.0f0) || error("3D singleton failed")

        # Buffers are zeroed before every launch so each run starts from the
        # same state.
        t_2d = bench(atomic_add_2d_ref_kernel, n_tiles, dst_2d,
                     ct.Constant(tile_r), ct.Constant(n_col_tiles);
                     reset=() -> CUDA.fill!(dst_2d, 0))
        t_3d = bench(atomic_add_3d_singleton_kernel, n_tiles, dst_3d,
                     ct.Constant(tile_r), ct.Constant(n_col_tiles);
                     reset=() -> CUDA.fill!(dst_3d, 0))

        println("$(n_rows)×$(n_cols) = $(lpad(n, 7)) elements ($(tile_r)×$(tile_r) tiles, $(n_tiles) blocks)")
        print_result("2D (NxN)", t_2d)
        print_result("3D (NxNx1)", t_3d; reference=t_2d)
        println()
    end
end
# --- Pointer/mask helper for tile-space indexing ---

# Given a tile-space `index` (one integer per array dimension) and the static
# tile `Shape`, return `(ptr_tile, mask)`:
#   * `ptr_tile` - `array.ptr` offset by the 0-based linear element offset of
#                  every element of the addressed tile
#   * `mask`     - per-element flag, set where the 1-based element index lies
#                  within `1:size(array, d)` in every dimension `d`
@inline function _tile_space_ptrs_mask(array::TileArray{T, N},
                                       index::NTuple{N, Integer},
                                       ::Val{Shape}) where {T, N, Shape}
    # Per-dimension 1-based element indices: an arange over the tile extent,
    # reshaped so it varies only along dimension d, shifted by the tile's base.
    elem_idx = ntuple(Val(N)) do d
        extent = Shape[d]
        dims = ntuple(i -> i == d ? extent : 1, Val(N))
        first_offset = Int32((index[d] - 1) * extent)
        lane = reshape(arange((extent,), Int32), dims)
        lane .+ Tile(first_offset)
    end

    # 0-based linear offset: sum over d of (idx[d] - 1) * stride[d].
    offsets = ntuple(Val(N)) do d
        (elem_idx[d] .- Tile(Int32(1))) .* Tile(array.strides[d])
    end
    linear_idx = reduce(.+, offsets)

    ptr_tile = Intrinsics.offset(array.ptr, linear_idx)

    # An element is addressable only if it is in range along every dimension.
    in_bounds = ntuple(Val(N)) do d
        (elem_idx[d] .>= Tile(Int32(1))) .& (elem_idx[d] .<= Tile(size(array, d)))
    end
    mask = reduce(.&, in_bounds)

    return (ptr_tile, mask)
end
@testset "atomic_add tile-space 1D" begin
    # Each block adds a tile of ones at its own tile-space position.
    function atomic_add_ts_kernel(arr::ct.TileArray{Int,1}, TILE::Int)
        block = ct.bid(1)
        ones_tile = ct.full((TILE,), 1, Int)
        ct.atomic_add(arr, block, ones_tile)
        return
    end

    TILE = 16
    counters = CUDA.zeros(Int, 64)

    # 4 blocks x 16 elements cover all 64 entries exactly once.
    ct.launch(atomic_add_ts_kernel, 4, counters, ct.Constant(TILE))

    @test all(==(1), Array(counters))
end
@testset "atomic_add tile-space 2D" begin
    # A single block adds a 4x4 tile of ones at tile position (bid, 1).
    function atomic_add_ts_2d_kernel(arr::ct.TileArray{Int,2})
        block = ct.bid(1)
        ones_tile = ct.full((4, 4), 1, Int)
        ct.atomic_add(arr, (block, Int32(1)), ones_tile)
        return
    end

    # 4 rows, 8 cols = 2 col-tiles of width 4
    counters = CUDA.zeros(Int, 4, 8)

    ct.launch(atomic_add_ts_2d_kernel, 1, counters)

    host = Array(counters)
    @test all(==(1), host[:, 1:4])  # first col-tile updated
    @test all(==(0), host[:, 5:8])  # second col-tile untouched
end
1 + + ct.launch(atomic_add_ts_trailing_kernel, 2, arr) + + @test all(Array(arr) .== 1) +end + @testset "1D gather - simple" begin # Simple 1D gather: copy first 16 elements using gather function gather_simple_kernel(src::ct.TileArray{Float32,1}, dst::ct.TileArray{Float32,1}) From 3f249c1da2e1aa53b1f60bbefa42798ef5e9ed14 Mon Sep 17 00:00:00 2001 From: AntonOresten Date: Mon, 23 Feb 2026 14:07:24 +0100 Subject: [PATCH 4/5] Remove benchmark script --- bench/atomic_vs_store.jl | 232 --------------------------------------- 1 file changed, 232 deletions(-) delete mode 100644 bench/atomic_vs_store.jl diff --git a/bench/atomic_vs_store.jl b/bench/atomic_vs_store.jl deleted file mode 100644 index 59bcb7a..0000000 --- a/bench/atomic_vs_store.jl +++ /dev/null @@ -1,232 +0,0 @@ -#!/usr/bin/env julia -# -# Benchmark: ct.store vs ct.atomic_add (tile-space) -# -# 1D: Compares atomic_add vs store when there's no contention. -# 2D: Compares flattened 1D (N*N) vs native 2D (NxN) tile-space atomic_add. -# -# Usage: -# julia --project=test bench/atomic_vs_store.jl - -using cuTile -import cuTile as ct -using CUDA - -# --- 1D Kernels --- - -function store_1d_kernel(dst::ct.TileArray{Float32,1}, TILE::Int) - bid = ct.bid(1) - tile = ct.full((TILE,), 1.0f0, Float32) - ct.store(dst, bid, tile) - return -end - -function atomic_add_1d_kernel(dst::ct.TileArray{Float32,1}, TILE::Int) - bid = ct.bid(1) - tile = ct.full((TILE,), 1.0f0, Float32) - ct.atomic_add(dst, bid, tile) - return -end - -# --- 2D Kernels --- - -# Flat: treat N*N as a 1D array, one tile per block -function atomic_add_flat_kernel(dst::ct.TileArray{Float32,1}, TILE::Int) - bid = ct.bid(1) - tile = ct.full((TILE,), 1.0f0, Float32) - ct.atomic_add(dst, bid, tile) - return -end - -# Native 2D: NxN tile-space index -function atomic_add_2d_kernel(dst::ct.TileArray{Float32,2}, TILE_R::Int, NCOLS::Int) - bid = ct.bid(1) - # bid is linear over the 2D tile grid; convert to (row_tile, col_tile) - row = (bid - Int32(1)) ÷ NCOLS + 
Int32(1) - col = (bid - Int32(1)) % NCOLS + Int32(1) - tile = ct.full((TILE_R, TILE_R), 1.0f0, Float32) - ct.atomic_add(dst, (row, col), tile) - return -end - -# --- Benchmark harness --- - -function bench(f, grid, args...; warmup=5, iters=100, reset=nothing, kwargs...) - for _ in 1:warmup - reset !== nothing && reset() - ct.launch(f, grid, args...; kwargs...) - end - CUDA.synchronize() - - times = Float64[] - for _ in 1:iters - reset !== nothing && reset() - CUDA.synchronize() - t = CUDA.@elapsed begin - ct.launch(f, grid, args...; kwargs...) - end - push!(times, t) - end - - sort!(times) - trim = max(1, iters ÷ 10) - trimmed = times[trim+1:end-trim] - - return ( - median = trimmed[length(trimmed) ÷ 2] * 1e6, - mean = sum(trimmed) / length(trimmed) * 1e6, - min = times[1] * 1e6, - max = times[end] * 1e6, - ) -end - -function print_result(label, t; reference=nothing) - line = " $(rpad(label, 14))$(lpad(round(t.median, digits=2), 8))μs (min $(round(t.min, digits=2))μs)" - if reference !== nothing - ratio = t.median / reference.median - line *= " $(round(ratio, digits=2))x" - end - println(line) -end - -# --- 1D benchmark --- - -function bench_1d() - TILE = 128 - println("=" ^ 60) - println("1D: ct.store vs ct.atomic_add (no contention)") - println(" Each block writes its own tile of $TILE Float32s") - println("=" ^ 60) - println() - - for n_tiles in [64, 256, 1024, 4096, 16384, 65536] - n = n_tiles * TILE - - dst_store = CUDA.zeros(Float32, n) - dst_atomic = CUDA.zeros(Float32, n) - - # Correctness - ct.launch(store_1d_kernel, n_tiles, dst_store, ct.Constant(TILE)) - ct.launch(atomic_add_1d_kernel, n_tiles, dst_atomic, ct.Constant(TILE)) - CUDA.synchronize() - @assert all(Array(dst_store) .== 1.0f0) - @assert all(Array(dst_atomic) .== 1.0f0) - - t_store = bench(store_1d_kernel, n_tiles, dst_store, ct.Constant(TILE)) - t_atomic = bench(atomic_add_1d_kernel, n_tiles, dst_atomic, ct.Constant(TILE); - reset=() -> CUDA.fill!(dst_atomic, 0)) - - println("$(lpad(n_tiles, 
4)) tiles × $TILE = $(lpad(n, 7)) elements") - print_result("store", t_store) - print_result("atomic_add", t_atomic; reference=t_store) - println() - end -end - -# --- 2D benchmark --- - -function bench_2d() - println("=" ^ 60) - println("2D: flat 1D (N*N) vs native 2D (NxN) atomic_add") - println(" Same total elements, different indexing strategies") - println("=" ^ 60) - println() - - for (tile_r, grid_r) in [(8, 8), (8, 16), (16, 16), (16, 32), (32, 32), (32, 64), (32, 128), (64, 64), (64, 128)] - n_rows = tile_r * grid_r - n_cols = n_rows - n = n_rows * n_cols - n_tiles_flat = n ÷ (tile_r * tile_r) # total tiles when flattened - n_col_tiles = n_cols ÷ tile_r - - dst_flat = CUDA.zeros(Float32, n) - dst_2d = CUDA.zeros(Float32, n_rows, n_cols) - - # Correctness - ct.launch(atomic_add_flat_kernel, n_tiles_flat, dst_flat, ct.Constant(tile_r * tile_r)) - ct.launch(atomic_add_2d_kernel, n_tiles_flat, dst_2d, - ct.Constant(tile_r), ct.Constant(n_col_tiles)) - CUDA.synchronize() - @assert all(Array(dst_flat) .== 1.0f0) "flat failed at $(n_rows)×$(n_cols)" - @assert all(Array(dst_2d) .== 1.0f0) "2D failed at $(n_rows)×$(n_cols)" - - t_flat = bench(atomic_add_flat_kernel, n_tiles_flat, dst_flat, ct.Constant(tile_r * tile_r); - reset=() -> CUDA.fill!(dst_flat, 0)) - t_2d = bench(atomic_add_2d_kernel, n_tiles_flat, dst_2d, - ct.Constant(tile_r), ct.Constant(n_col_tiles); - reset=() -> CUDA.fill!(dst_2d, 0)) - - println("$(n_rows)×$(n_cols) = $(lpad(n, 7)) elements ($(tile_r)×$(tile_r) tiles, $(n_tiles_flat) blocks)") - print_result("flat 1D", t_flat) - print_result("native 2D", t_2d; reference=t_flat) - println() - end -end - -# --- 3D trailing singleton benchmark --- - -# NxN tile in a 2D array -function atomic_add_2d_ref_kernel(dst::ct.TileArray{Float32,2}, TILE_R::Int, NCOLS::Int) - bid = ct.bid(1) - row = (bid - Int32(1)) ÷ NCOLS + Int32(1) - col = (bid - Int32(1)) % NCOLS + Int32(1) - tile = ct.full((TILE_R, TILE_R), 1.0f0, Float32) - ct.atomic_add(dst, (row, col), 
tile) - return -end - -# NxNx1 tile in a 3D array (trailing singleton) -function atomic_add_3d_singleton_kernel(dst::ct.TileArray{Float32,3}, TILE_R::Int, NCOLS::Int) - bid = ct.bid(1) - row = (bid - Int32(1)) ÷ NCOLS + Int32(1) - col = (bid - Int32(1)) % NCOLS + Int32(1) - tile = ct.full((TILE_R, TILE_R, 1), 1.0f0, Float32) - ct.atomic_add(dst, (row, col, Int32(1)), tile) - return -end - -function bench_trailing_singleton() - println("=" ^ 60) - println("Trailing singleton: 2D (NxN) vs 3D (NxNx1) atomic_add") - println(" Same data, extra singleton dimension in 3D") - println("=" ^ 60) - println() - - for (tile_r, grid_r) in [(8, 8), (8, 16), (16, 16), (16, 32), (32, 32), (32, 64), (32, 128), (64, 64), (64, 128)] - n_rows = tile_r * grid_r - n_cols = n_rows - n = n_rows * n_cols - n_tiles = n ÷ (tile_r * tile_r) - n_col_tiles = n_cols ÷ tile_r - - dst_2d = CUDA.zeros(Float32, n_rows, n_cols) - dst_3d = CUDA.zeros(Float32, n_rows, n_cols, 1) - - # Correctness - ct.launch(atomic_add_2d_ref_kernel, n_tiles, dst_2d, - ct.Constant(tile_r), ct.Constant(n_col_tiles)) - ct.launch(atomic_add_3d_singleton_kernel, n_tiles, dst_3d, - ct.Constant(tile_r), ct.Constant(n_col_tiles)) - CUDA.synchronize() - @assert all(Array(dst_2d) .== 1.0f0) "2D failed" - @assert all(Array(dst_3d) .== 1.0f0) "3D singleton failed" - - t_2d = bench(atomic_add_2d_ref_kernel, n_tiles, dst_2d, - ct.Constant(tile_r), ct.Constant(n_col_tiles); - reset=() -> CUDA.fill!(dst_2d, 0)) - t_3d = bench(atomic_add_3d_singleton_kernel, n_tiles, dst_3d, - ct.Constant(tile_r), ct.Constant(n_col_tiles); - reset=() -> CUDA.fill!(dst_3d, 0)) - - println("$(n_rows)×$(n_cols) = $(lpad(n, 7)) elements ($(tile_r)×$(tile_r) tiles, $(n_tiles) blocks)") - print_result("2D (NxN)", t_2d) - print_result("3D (NxNx1)", t_3d; reference=t_2d) - println() - end -end - -# --- Run --- - -bench_1d() -bench_2d() -bench_trailing_singleton() From df68a92ddc5445e6c5c82f470e790eb3573c798e Mon Sep 17 00:00:00 2001 From: AntonOresten Date: 
# --- Tile-space atomic_add ---

# N-D tuple index + tile value (addressed like `store`).
# The tile is first reshaped to rank N via `_reshape_to_rank`; the shape of
# the reshaped tile then determines the pointer tile and mask.
# Returns whatever `Intrinsics.atomic_add_tile` returns.
@inline function atomic_add(array::TileArray{T, N},
                            index::NTuple{N, Integer}, tile::Tile{T};
                            memory_order::Int=MemoryOrder.AcqRel,
                            memory_scope::Int=MemScope.Device) where {T, N}
    ranked = _reshape_to_rank(tile, Val(N))
    ptrs, valid = _tile_space_ptrs_mask(array, index, Val(size(ranked)))
    return Intrinsics.atomic_add_tile(ptrs, ranked, valid, memory_order, memory_scope)
end

# 1D convenience: a scalar tile-space index is wrapped in a 1-tuple and
# forwarded to the N-D method.
@inline function atomic_add(array::TileArray{T, 1},
                            index::Integer, tile::Tile{T};
                            memory_order::Int=MemoryOrder.AcqRel,
                            memory_scope::Int=MemScope.Device) where {T}
    index_tuple = (index,)
    return atomic_add(array, index_tuple, tile;
                      memory_order=memory_order, memory_scope=memory_scope)
end